<!doctype html>

<html lang="en">

<head>
  <!-- Required meta tags: the character encoding is declared first, before any text content such as the title -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <title>ANetQA</title>
  <meta name="keywords" content="video, reasoning, physics, deep learning, computer vision, machine learning">
  <meta name="description"
    content="ANetQA: A Large-scale Benchmark for Fine-grained Compositional Reasoning over Untrimmed Videos">

  <!-- Bootstrap 3.3.7 core and theme stylesheets, served from the CDN -->
  <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" rel="stylesheet">
  <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css" rel="stylesheet">

  <!-- Third-party scripts: analytics (async), then jQuery slim, Popper and Bootstrap JS in load order -->
  <script async src="https://www.google-analytics.com/analytics.js"></script>
  <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js"></script>
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>

  <!-- gtag.js loader followed by its inline configuration -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=UA-81724582-4"></script>
  <script>
    // Queue shared with gtag.js; reuse it if it already exists.
    window.dataLayer = window.dataLayer || [];

    // Every gtag(...) call is recorded by pushing its arguments object onto the queue.
    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());
    gtag('config', 'UA-81724582-4');
  </script>

  <style>
    /* Base text size for the whole page */
    body {
      font-size: 16px;
    }

    /* ---- Fixed top navigation bar ---- */
    .navbar-fixed-top {
      min-height: 60px;
    }

    .navbar-nav {
      margin-left: -20px;
    }

    /* Section links inside the navigation list */
    .navbar-nav > li > a {
      padding-top: 10px;
      padding-bottom: 0px;
      line-height: 60px;
      font-size: 22px;
      color: gray;
    }

    .navbar-nav > li > a:active {
      color: black;
    }

    /* On hover: darken the text and suppress the tap highlight, outline, background and underline */
    .navbar-nav > li > a:hover {
      color: black;
      -webkit-tap-highlight-color: transparent;
      outline: none;
      background: none;
      text-decoration: none;
    }

    /* ---- Content helpers ---- */

    /* Custom (non-Bootstrap) class: justified inline-block text column */
    .col-md-13 {
      display: inline-block;
      text-align: justify;
    }

    /* ---- Logos ---- */

    /* Project logo, floated to the left */
    .anet_logo {
      margin-top: -30px;
      margin-right: 30px;
      float: left;
    }

    /* Wrapper for the right-hand logos, pulled upwards by a negative top margin */
    .logo {
      margin-top: -80px;
      margin-right: 30px;
      float: right;
    }

    /* Individual right-floated logos; .logo3 gets an extra top offset */
    .logo1,
    .logo2 {
      margin-right: 30px;
      float: right;
    }

    .logo3 {
      margin-top: 20px;
      margin-right: 30px;
      float: right;
    }

    /* ---- Floated blocks ---- */

    /* Left-floated images and the question-answer download group */
    .img1,
    .img2,
    .qa {
      float: left;
    }

    /* Scene-graph download group, floated right with a 20% right margin */
    .sg {
      margin-right: 20%;
      float: right;
    }
  </style>
  <!-- Custom styles for this template; loaded after the inline styles, so its rules win ties in the cascade -->
  <link href="jumbotron.css" rel="stylesheet">
</head>



<body data-gr-c-s-loaded="true">
  <!-- Fixed top bar: project logo, in-page section links, and linked institution logos -->
  <nav class="navbar-fixed-top" style="background-color: rgb(255,255,255)" aria-label="Page sections">
    <div class="anet_logo">
      <img src="logo.jpg" height="125" width="125" alt="ANetQA logo">
    </div>

    <!-- NOTE(review): "collapse navbar-collapse" is used without a toggle button; presumably these links
         are hidden on narrow viewports. Confirm on a phone-sized screen. -->
    <div class="collapse navbar-collapse" id="navbarsExampleDefault">
      <ul class="nav navbar-nav mr-auto">
        <li><a href="#Paper" style="font-size: 20px">Paper</a></li>
        <li><a href="#Dataset" style="font-size: 20px">Dataset</a></li>
        <li><a href="#Code" style="font-size: 20px">Code</a></li>
        <li><a href="#Eval" style="font-size: 20px">Evaluation</a></li>
        <li><a href="#Licence" style="font-size: 20px">Licence</a></li>
      </ul>
    </div>

    <!-- Each logo is the only content of its link, so its alt text names the link destination -->
    <div class="logo">
      <div class="logo3">
        <a href="https://research.lenovo.com/webapp/view/index.html"><img class="img-responsive img-rounded"
            src="lenovo_logo.png" height="30" width="150" alt="Lenovo Research"></a>
      </div>
      <div class="logo1">
        <a href="https://www.zju.edu.cn/"><img class="img-responsive img-rounded" src="zju_logo.png" height="75"
            width="75" alt="Zhejiang University"></a>
      </div>
      <div class="logo2">
        <a href="https://www.hdu.edu.cn"><img class="img-responsive img-rounded" src="hdu_logo.jpg" height="75"
            width="75" alt="Hangzhou Dianzi University"></a>
      </div>
    </div>

  </nav>



  <main role="main">
    <!-- Title block: paper title, author list with affiliation markers, and the affiliation legend -->
    <div class="container" style="padding-top: 80px; font-size: 20px">
      <div class="text-center">
        <h1 class="text-center">
          ANetQA: A Large-scale Benchmark for Fine-grained
          <br>
          Compositional Reasoning over Untrimmed Videos
        </h1><br>


        <!-- Superscripts refer to the numbered affiliations below; * marks the corresponding author -->
        <b>Zhou Yu<sup>1</sup></b> &nbsp;&nbsp;&nbsp;
        <b>Lixiang Zheng<sup>1</sup></b> &nbsp;&nbsp;&nbsp;
        <b>Zhou Zhao<sup>2</sup></b> &nbsp;&nbsp;&nbsp;
        <b>Fei Wu<sup>2</sup></b> &nbsp;&nbsp;&nbsp;
        <b>Jianping Fan<sup>1,3</sup></b> &nbsp;&nbsp;&nbsp;
        <b>Kui Ren<sup>4</sup></b> &nbsp;&nbsp;&nbsp;
        <b>Jun Yu<sup>1*</sup></b>

        <br><br>
        <h5><sup>1</sup>School of Computer Science, Hangzhou Dianzi University, China</h5>
        <h5><sup>2</sup>College of Computer Science and Technology, Zhejiang University, China</h5>
        <h5><sup>3</sup>AI Lab at Lenovo Research, China</h5>
        <h5><sup>4</sup>School of Cyber Science and Technology, Zhejiang University, China</h5>
        <h5><sup>*</sup>Corresponding author</h5>
      </div>

    </div>
    <br><br>
    <div class="container">
      <!-- Abstract: heading, justified abstract text, then the figure below it.
           The padding-top / negative margin-top pair on the heading offsets anchor jumps by 80px. -->
      <h2 id="RFSleep" style="padding-top: 80px; margin-top: -80px;">Abstract</h2>
      <hr>
      <div class="row">
        <!-- .col-md-13 is a page-local class (inline-block, justified), not a Bootstrap grid column -->
        <div class="col-md-13" style="margin-left: 20px;margin-right: 20px;">
          Building benchmarks to systemically analyze different
          capabilities of video question answering (VideoQA) models
          is challenging yet crucial. Existing benchmarks often
          use non-compositional simple questions and suffer from
          language biases, making it difficult to diagnose model
          weaknesses incisively. A recent benchmark AGQA poses
          a promising paradigm to generate QA pairs automatically
          from pre-annotated scene graphs, enabling it to measure
          diverse reasoning abilities with granular control. However,
          its questions have limitations in reasoning about the fine-grained
          semantics in videos as such information is absent
          in its scene graphs. To this end, we present ANetQA, a
          large-scale benchmark that supports fine-grained compositional
          reasoning over the challenging untrimmed videos
          from ActivityNet. Similar to AGQA, the QA pairs
          in ANetQA are automatically generated from annotated
          video scene graphs. The fine-grained properties of ANetQA
          are reflected in the following: (i) untrimmed videos with
          fine-grained semantics; (ii) spatio-temporal scene graphs
          with fine-grained taxonomies; and (iii) diverse questions
          generated from fine-grained templates. ANetQA attains 1.4
          billion unbalanced and 13.4 million balanced QA pairs,
          which is an order of magnitude larger than AGQA with
          a similar number of videos. Comprehensive experiments
          are performed for state-of-the-art methods. The best model
          achieves 44.5% accuracy while human performance tops
          out at 84.5%, leaving sufficient room for improvements.</div>
        <div style="margin-top: 20px">
          <div style="margin-left: 20px">
            <!-- NOTE(review): alt is empty, which marks the figure as decorative; add a description if it carries content -->
            <img class="img-responsive img-rounded img1" src="anetqaex.jpg" alt="">
          </div>
        </div>
      </div>
      <br><br>

      <!-- Paper: linked title, author list, venue and PDF link -->
      <div class="container">
        <h2 id="Paper" style="padding-top: 80px; margin-top: -80px;">Paper</h2>
        <hr>
        <div class="row">
          <div class="col-md-9">
            <!-- Title and [PDF] both point at the same arXiv abstract page -->
            <a href="https://arxiv.org/abs/2305.02519">
              <b>ANetQA: A Large-scale Benchmark for Fine-grained Compositional Reasoning over Untrimmed Videos</b>
            </a><br>

            Zhou Yu, Lixiang Zheng, Zhou Zhao, Fei Wu, Jianping Fan, Kui Ren, Jun Yu <br>
            <i> CVPR, 2023</i><br>
            <a href="https://arxiv.org/abs/2305.02519">[PDF]</a>
          </div>
        </div>
      </div><br><br>


      <!-- Dataset downloads: raw videos, scene graphs and question-answer pairs.
           List items are wrapped in <ul> elements (an <li> needs a list parent); the lists zero their
           left padding and bottom margin so the bullets stay where they were rendered before. -->
      <div class="container">
        <h2 id="Dataset" style="padding-top: 80px; margin-top: -80px;">Dataset</h2>
        <hr>
        <h3>Videos</h3>
        <div class="row">
          <div class="col-md-9">
            <ul style="padding-left: 0; margin-bottom: 0;">
              <li>Raw videos from <a href="http://activity-net.org/download.html" class="download-link">ActivityNet
                  v1.3</a></li>
              <li><a
                  href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EYIaBMbntepBt2tiG7USPO8Byi3ap-MkltQNdtUh9vZ2_w?download=1"
                  class="download-link">Meta information</a> of all videos.</li>
            </ul>
          </div>
        </div>
        <hr>
        <!-- .sg is floated right and .qa is floated left by the inline page styles -->
        <div class="sg">
          <h3>Scene Graphs</h3>
          <div class="row">
            <div class="col-md-12">
              <ul style="padding-left: 0; margin-bottom: 0;">
                <li><a
                    href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/ESVPcVYJlXZAlD5IkD_9y80BYcsfpC7Gp9LJGfXJYqreSw?download=1"
                    class="download-link">Train scene graphs</a> from 9,155 videos.</li>
                <li><a
                    href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/Ea2UrDyFRTNNisb3BtJj6eAB5uoI1A7ewXam7OesPpFqzg?download=1"
                    class="download-link">Val scene graphs</a> from 1,185 videos.</li>
                <li><a
                    href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/ESY5fUliVkZOo90ys6MdOGEBmKL8cpkV9kiwTKW0sAvsIQ?download=1"
                    class="download-link">Meta information</a> of all scene graphs.</li>
              </ul>
            </div>
          </div>
        </div>
        <div class="qa">
          <h3>Question-Answer Pairs</h3>
          <div class="row">
            <div class="col-md-12">
              <ul style="padding-left: 0; margin-bottom: 0;">
                <li><a
                    href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EbOk0tgkpZlIqwsW1yVq7PgB2jJUx0x0eCv5iu73hl--uQ?download=1"
                    class="download-link">Train QA pairs</a> (10,456,011 samples)</li>
                <li><a
                    href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EckH2khIKF9PqX4lWwN2uJ0Bn_jA1Qvzv08Ny9jxEuzfWw?download=1"
                    class="download-link">Val QA pairs</a> (1,474,723 samples)</li>
                <li><a
                    href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EdIUQBXaNXVMm8-Kmvg3gjcBHmnaeE5s4OP8OO5Ics7URA?download=1"
                    class="download-link">Test questions</a> (1,503,510 samples)</li>
                <li><a
                    href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/ERBudANdq9JAgXHBw_YsnZsBso6nskwG9FxyShhxhSQ3Tg?download=1"
                    class="download-link">Test-dev questions</a> (300,694 samples)</li>
                <li><a
                    href="https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EYiwgEwCXu5Apcoj3I-Y3ewBBHPb0N24po5q13Ep9BXPFA?download=1"
                    class="download-link">Test-tiny questions</a> (20,000 samples)</li>
              </ul>
              <!-- Footnote for the test splits listed above -->
              <div>
                *The <i>test-dev</i> and <i>test-tiny</i> splits are two subsets of the <i>test</i> split.
              </div>
            </div>
          </div>
          <div class="row">
            <div class="col-md-9" style="margin-top:20px; width: 500px; ">
              More details of the dataset are provided <a
                href="https://github.com/MILVLG/anetqa-code/tree/main/dataset">here</a>.
            </div>
          </div>
        </div>
      </div>
        <br><br>

        <!-- Code: pointer to the baseline-model repository -->
        <div class="container">
          <h2 id="Code" style="padding-top: 80px; margin-top: -80px;">Code</h2>
          <hr>
          <div class="row">
            <div class="col-md-9">
              Code for ANetQA baseline models is available <a href="https://github.com/MILVLG/anetqa-code">here</a>.
            </div>
          </div>
        </div><br><br>

        <!-- Evaluation: link to the online server, the expected result-file format, and an example file -->
        <div class="container">
          <h2 id="Eval" style="padding-top: 80px; margin-top: -80px;">Evaluation</h2>
          <hr>
          <div class="row">
            <div class="col-md-9">
              Evaluation for the testing set is provided on the online <a href="https://eval.ai/web/challenges/challenge-page/2226/overview">EvalAI server</a>.
            </div>
          </div>
          <h3>Submission Format</h3>
          <!-- The snippet is laid out with <br> and &nbsp; because <code> inside <p> collapses ordinary whitespace -->
          <p style="font-size:15px; font-weight: 200; border-style: solid;
          border-width: 1px; text-align:justify;">
            <code style="background-color: #fff;">
                [...<br>
                &nbsp;&nbsp;&nbsp;&nbsp;{<br>
                  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"question_id": question_id,<br>
                  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"answer": answer<br>
                  &nbsp;&nbsp;&nbsp;&nbsp;},<br>
              ...]
          </code>
          </p>
          We have provided an example result JSON file <a
            href="https://github.com/MILVLG/anetqa-code/blob/main/dataset/fake_res.json">here</a>.
        </div><br><br>

        <!-- Licence: the id is the target of the "#Licence" navigation link, so it must keep this spelling -->
        <div class="container">
          <h2 id="Licence" style="padding-top: 80px; margin-top: -80px;">Licence</h2>
          <hr>
          <div class="row">
            <div class="col-md-9">
              The annotations in this dataset belong to the ANetQA Team and are licensed under a <a href="https://creativecommons.org/licenses/by-nc/4.0/deed.en">CC BY-NC 4.0</a> License.
            </div>
          </div>
        </div><br><br>


        <!-- BibTeX entry for citing the paper. The <pre> content starts at column 0 and is indented with
             ordinary spaces, so copied text contains no non-breaking spaces. -->
        <div class="container">
          <h2 id="More" style="padding-top: 80px; margin-top: -80px;">Bibtex</h2>
          <hr>

          <div class="row">
            <pre style="font-size:12px;margin-left: 15px;margin-right: 15px;">
@inproceedings{yu2023anetqa,
   title={ANetQA: A Large-scale Benchmark for Fine-grained Compositional Reasoning over Untrimmed Videos},
   author={Yu, Zhou and Zheng, Lixiang and Zhao, Zhou and Wu, Fei and Fan, Jianping and Ren, Kui and Yu, Jun},
   booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
   pages={23191--23200},
   year={2023}
}</pre>
          </div>
        </div><br><br>

      </div>
  </main>




</body>

</html>