<!-- index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Encoding declaration comes first so it sits inside the first 1024 bytes that browsers pre-scan. -->
  <meta charset="utf-8">
  <!-- Base typography and link colours for the page; declared ahead of the linked stylesheets. -->
  <style>
    /* Palatino served from this site's static/font directory. */
    @font-face {
      font-family: "Palatino";
      src: url(./static/font/Palatino.ttc) format('truetype');
    }
    /* Running text: Palatino-style serif stack at 20px. */
    body, td, th, tr, p, a, ul, li, span {
      font-family: "Palatino", "TeXGyrePagella", "Palatino Linotype", "Book Antiqua", serif; /*'Lato', Verdana, Helvetica, sans-serif;*/
      font-size: 20px; /*14*/
      line-height: 130%;
    }
    h1 {
      font-family: "TeXGyrePagella", "Palatino Linotype", "Book Antiqua", Palatino, serif; /*'Lato', Verdana, Helvetica, sans-serif;*/
      font-size: 25px;
    }
    h2 {
      font-family: "TeXGyrePagella", "Palatino Linotype", "Book Antiqua", Palatino, serif; /*'Lato', Verdana, Helvetica, sans-serif;*/
      font-size: 22px;
      color: #ed6861; /*#fc4a1a;*/  /*#e37222;*/
    }
    /* Links: purple by default, orange on hover/focus, never underlined. */
    a {
      color: #ca3fec; /*#1772d0;*/
      text-decoration: none;
    }
    a:focus, a:hover {
      color: #e37222; /*#f7b733;*/ /*f09228;*/
      text-decoration: none;
    }
    ul, ol {
      padding-left: 50px;
    }
  </style>
  <!-- <style>
    body {
      font-family: "Palatino", sans-serif;
    }
  </style> -->
  <meta name="description"
        content="LoHoRavens: A Long-Horizon Language-Conditioned Benchmark for Robotic Tabletop Rearrangement.">
  <meta name="keywords" content="Long-Horizon Language-Conditioned Manipulation, Large Language Models, Vision-Language Models">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>LoHoRavens</title>

  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-4Y34PZ3XBE"></script>
  <script>
    // Bootstrap for the gtag.js library requested by the async script tag just before this one.
    // Reuse a global dataLayer array if one already exists; otherwise start with an empty array.
    window.dataLayer = window.dataLayer || [];
    // gtag() appends its raw `arguments` object (not a copied array) to dataLayer.
    // NOTE(review): this appears to be Google's published snippet verbatim — confirm against the
    // gtag.js documentation before changing how the arguments are pushed.
    function gtag(){dataLayer.push(arguments);}
    // Queue a 'js' entry carrying the current time.
    gtag('js', new Date());

    // Queue a 'config' entry for G-4Y34PZ3XBE, the same id that appears in the loader URL.
    gtag('config', 'G-4Y34PZ3XBE');
  </script>

  <script>
    /**
     * Point the single-task result player at the clip chosen in the
     * "single-menu-*" controls and start it at double speed.
     *
     * Reads the `value` of #single-menu-demos, #single-menu-tasks and
     * #single-menu-instances, builds the clip URL under
     * https://cliport.github.io/media/results_web/ from them, and assigns it
     * to #single-task-result-video.
     *
     * Returns without doing anything when any of those four elements is
     * missing, so a page that omits the selector widget does not raise a
     * TypeError on a null lookup.
     */
    function updateSingleVideo() {
      var demoMenu = document.getElementById("single-menu-demos");
      var taskMenu = document.getElementById("single-menu-tasks");
      var instMenu = document.getElementById("single-menu-instances");
      var video = document.getElementById("single-task-result-video");

      // getElementById yields null for unknown ids; bail out instead of throwing.
      if (!demoMenu || !taskMenu || !instMenu || !video) {
        return;
      }

      var demo = demoMenu.value;
      var task = taskMenu.value;
      var inst = instMenu.value;

      console.log("single", demo, task, inst);

      video.src = "https://cliport.github.io/media/results_web/" +
                  task +
                  "-two_stream_full_clip_lingunet_lat_transporter-n" +
                  demo +
                  "-train/videos/" +
                  task +
                  "-0000" +
                  inst +
                  ".mp4";
      video.playbackRate = 2.0;

      // play() may return a promise that rejects when the browser refuses to
      // start playback; report that instead of leaving an unhandled rejection.
      var playback = video.play();
      if (playback && typeof playback.catch === "function") {
        playback.catch(function (err) {
          console.warn("single-task video playback did not start:", err);
        });
      }
    }

    /**
     * Point the multi-task result player at the clip chosen in the
     * "multi-menu-*" controls and start it at double speed.
     *
     * Reads the `value` of #multi-menu-demos, #multi-menu-tasks and
     * #multi-menu-instances, builds the "multi-language-conditioned-" clip
     * URL under https://cliport.github.io/media/results_web/ from them, and
     * assigns it to #multi-task-result-video.
     *
     * Returns without doing anything when any of those four elements is
     * missing, so a page that omits the selector widget does not raise a
     * TypeError on a null lookup.
     */
    function updateMultiVideo() {
      var demoMenu = document.getElementById("multi-menu-demos");
      var taskMenu = document.getElementById("multi-menu-tasks");
      var instMenu = document.getElementById("multi-menu-instances");
      var video = document.getElementById("multi-task-result-video");

      // getElementById yields null for unknown ids; bail out instead of throwing.
      if (!demoMenu || !taskMenu || !instMenu || !video) {
        return;
      }

      var demo = demoMenu.value;
      var task = taskMenu.value;
      var inst = instMenu.value;

      console.log("multi", demo, task, inst);

      video.src = "https://cliport.github.io/media/results_web/" +
                  task +
                  "-two_stream_full_clip_lingunet_lat_transporter-n" +
                  demo +
                  "-train/videos/multi-language-conditioned-" +
                  task +
                  "-0000" +
                  inst +
                  ".mp4";
      video.playbackRate = 2.0;

      // play() may return a promise that rejects when the browser refuses to
      // start playback; report that instead of leaving an unhandled rejection.
      var playback = video.play();
      if (playback && typeof playback.catch === "function") {
        playback.catch(function (err) {
          console.warn("multi-task video playback did not start:", err);
        });
      }
    }

  </script>


  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>
  <style>
    /*
     * Font stack applied through the universal selector.
     * NOTE(review): "Google+Sans" and "Noto+Sans" look like the URL-encoded
     * spellings from the Google Fonts stylesheet link; presumably the loaded
     * families are named "Google Sans" and "Noto Sans", in which case these
     * two entries never match and "Castoro" is the first web font that can
     * apply. Confirm the intended fallback order before renaming them, since
     * a rename would change which font is rendered.
     */
    * {
      font-family: "TeXGyrePagella", "Palatino Linotype", "Google+Sans", "Noto+Sans", "Castoro", Palatino, serif;
    }
</style>

<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-2 publication-title">LoHoRavens: A Long-Horizon Language-Conditioned Benchmark for Robotic Tabletop Rearrangement</h1>
          <!-- <h3 class="title is-4 conference-authors"><a target="_blank" href="https://www.robot-learning.org/">CoRL 2021</a></h3> -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a target="_blank" href="https://shengqiang-zhang.github.io/">Shengqiang Zhang</a><sup>1,4</sup>,</span>
            <span class="author-block">
              <a target="_blank" href="https://www.phil-wicke.com/">Philipp Wicke</a><sup>1,4</sup>,</span><br>
            <span class="author-block">
              <a target="_blank">Haihui Ye</a><sup>3</sup>,</span>
            <span class="author-block">
              <a target="_blank" href="https://scholar.google.ch/citations?user=w5ePE1oAAAAJ&hl=en">Lütfi Kerem Şenel</a><sup>1,4</sup>,</span>
            <span class="author-block">
              <a target="_blank" href="https://scholar.google.de/citations?user=eIz0XvMAAAAJ&hl=en">Zhenshan Bing</a><sup>3</sup>,</span>
            <span class="author-block">
              <a target="_blank" href="https://scholar.google.com/citations?user=ppZN58sAAAAJ&hl=en">Luis Figueredo</a><sup>2</sup>,</span><br>
            <span class="author-block">
              <a target="_blank" href="https://scholar.google.com/citations?user=IBzbBbwAAAAJ&hl=en">Abdeldjallil Naceri</a><sup>2</sup>,</span>
            <span class="author-block">
              <a target="_blank" href="https://scholar.google.de/citations?user=-CA8QgwAAAAJ&hl=en">Alois Knoll</a><sup>3</sup>,</span>
            <span class="author-block">
              <a target="_blank" href="https://scholar.google.de/citations?user=H1v0ztEAAAAJ&hl=de">Sami Haddadin</a><sup>2</sup>,</span>
            <span class="author-block">
              <a target="_blank" href="https://bplank.github.io/">Barbara Plank</a><sup>1,4</sup>,</span>
            <span class="author-block">
              <a target="_blank" href="https://scholar.google.com/citations?user=qIL9dWUAAAAJ&hl=en">Hinrich Schütze</a><sup>1,4</sup></span>
            
            
            <!-- <span class="author-block">
              <a target="_blank" href="https://homes.cs.washington.edu/~fox/">Dieter Fox</a><sup>1, 2</sup>
            </span> -->
          </div>

          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>1</sup>CIS, LMU Munich</span>
            <span class="author-block"><sup>&nbsp;2</sup>RSI, MIRMI, TUM</span><br>
            <span class="author-block"><sup>3</sup>Informatics, TUM</span>
            <span class="author-block"><sup>&nbsp;4</sup>Munich Center for Machine Learning (MCML)</span>
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <span class="link-block">
                <a target="_blank" href=""
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>
<!--               <span class="link-block">
                <a target="_blank" href="https://drive.google.com/file/d/1xzG5e1XF958HPuD_FZTiKROd9AQyd1fS/view?usp=sharing"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span> -->
              <!-- Video Link. -->
              <span class="link-block">
                <a target="_blank" href="https://youtu.be/sguEFlVdEUA"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-youtube"></i>
                  </span>
                  <span>Video</span>
                </a>
              </span>
              <!-- Code Link. -->
              <span class="link-block">
                <a target="_blank" href="https://github.com/Shengqiang-Zhang/LoHo-Ravens"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                  </a>
              </span>
            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- <section class="hero teaser">
  <div class="container is-fullhd">
    <div class="hero-body">
      <video id="teaser" autoplay muted loop height="100%">
        <source src="https://cliport.github.io/media/videos/10sim_web_teaser.mp4"
                type="video/mp4">
      </video>
      <h2 class="subtitle has-text-centered">
      </br>
        <span class="dcliport">CLIPort</span> is an end-to-end imitation-learning agent that can learn a single language-conditioned policy for various tabletop tasks.
      </h2>
    </div>
  </div>
</section> -->


<!-- <section class="hero is-light is-small">
  <div class="hero-body">
    <div class="container">
      <div id="results-carousel" class="carousel results-carousel">
        <div class="item item-steve">
          <video poster="" id="steve" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/1_folding.mp4"
                    type="video/mp4">
          </video>
        </div>
        <div class="item item-fullbody">
          <video poster="" id="fullbody" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/4_chess.mp4"
                    type="video/mp4">
          </video>
        </div>
        <div class="item item-shiba">
          <video poster="" id="shiba" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/3_packing.mp4"
                    type="video/mp4">
          </video>
        </div>
        <div class="item item-blueshirt">
          <video poster="" id="blueshirt" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/6_sweeping.mp4"
                    type="video/mp4">
          </video>
        </div>
        <div class="item item-shiba">
          <video poster="" id="shiba" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/9_cherry.mp4"
                    type="video/mp4">
          </video>
        </div>
        <div class="item item-shiba">
          <video poster="" id="shiba" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/7_reading.mp4"
                    type="video/mp4">
          </video>
        </div>
       <div class="item item-chair-tp">
          <video poster="" id="chair-tp" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/2_bowl.mp4"
                    type="video/mp4">
          </video>
        </div>
        <div class="item item-blueshirt">
          <video poster="" id="blueshirt" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/8_rope.mp4"
                    type="video/mp4">
          </video>
        </div>
        <div class="item item-blueshirt">
          <video poster="" id="blueshirt" autoplay controls muted loop height="100%">
            <source src="https://cliport.github.io/media/videos/5_stacking.mp4"
                    type="video/mp4">
          </video>
        </div>
      </div>
    </div>
  </div>
</section>
<h2 class="subtitle has-text-centered">
</br>
  We learn <b>one multi-task policy</b> for 9 real-world tasks including folding cloths, sweeping beans etc. with just <b>179</b> image-action training pairs.
</h2> -->


<section class="section">
  <div class="container is-max-widescreen">

    <div class="rows">


    <!-- Animation. -->
    <div class="rows is-centered ">
      <div class="row is-full-width">
        <h2 class="title is-3">LoHoRavens Benchmark Examples</h2>
          <section class="hero is-light is-small">
            <div class="hero-body">
              <div class="container">
                <!-- Benchmark example clips. Every video id is unique: the first use of each template id is kept and later repeats carry a numeric suffix. -->
                <div id="results-carousel" class="carousel results-carousel">
                  <div class="item item-steve">
                    <video id="steve" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0002.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-fullbody">
                    <video id="fullbody" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0003.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-shiba">
                    <video id="shiba" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0004.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-blueshirt">
                    <video id="blueshirt" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0005.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-shiba">
                    <video id="shiba-2" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0006.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-shiba">
                    <video id="shiba-3" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0007.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-chair-tp">
                    <video id="chair-tp" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0008.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-blueshirt">
                    <video id="blueshirt-2" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0009.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-blueshirt">
                    <video id="blueshirt-3" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0010.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-blueshirt">
                    <video id="blueshirt-4" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0011.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-blueshirt">
                    <video id="blueshirt-5" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/ICRA2024-LoHoRavens-1080P-d1_V1-0012.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-blueshirt">
                    <video id="blueshirt-6" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/put-hidden-blocks-in-pyramid-1.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                  <div class="item item-blueshirt">
                    <video id="blueshirt-7" autoplay controls muted loop height="100%">
                      <source src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/videos/move-blocks-between-absolute-positions-by-color-size-1.mp4"
                              type="video/mp4">
                    </video>
                  </div>
                </div>
              </div>
            </div>
          </section>
          <!-- <h2 class="subtitle has-text-centered">
          </br>
          There are 11 long-horizon language-conditioned tasks in LoHoRavens. 
            We learn <b>one multi-task policy</b> for 9 real-world tasks including folding cloths, sweeping beans etc. with just <b>179</b> image-action training pairs.
          </h2> -->

        <h2 class="title is-3">Baselines</h2>
        <!-- <div class="content has-text-justified">
          <p>
            It has been a mainstream method to use LLMs as the planner for a robot's execution.  
            However, how to incorporate real-time visual observation feedback into the
            LLM's input is still an under-explored problem.  
            This modality gap is especially severe for long-horizon robotic tasks because
            an execution error in each of the robot's steps can affect all the following steps.
            To solve this modality bridging problem, we propose two baseline methods to translate the visual observation into feedback that the LLM can understand for its closed-loop planning. 
            We use the Planner-Actor-Reporter paradigm to unify our two baselines.
            The feedback generation models of the two baselines are working as the Reporter module.
          </p>
        </div> -->


        <!-- Interpolating. -->
        <h3 class="title is-4">Imitation Learning based model (IL)</h3>
        <div class="content has-text-justified">
          <p>
          We use the same architecture and training recipe as CLIPort for the imitation learning baseline.
          Using multi-task training, the CLIPort model is trained with the train sets of all 20 seen tasks along with the three pick-and-place primitives for 100K steps. 
          Because the vanilla CLIPort does not know when to stop execution, following Inner Monologue and CaP, we use an oracle termination variant that uses the oracle information from the simulator to detect the success rate and stop the execution process.
          </p>
        </div>

        <br/>
        <br/>

        <h3 class="title is-4">Planner-Actor-Reporter based model (PAR)</h3>
        <img src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/images/explicit_baseline.png" class="interpolation-image"
         alt="Diagram of the Planner-Actor-Reporter (PAR) baseline: an LLM Planner, a CLIPort Actor and a VLM Reporter." />
        <div class="content has-text-justified">
          <p>
            The Planner-Actor-Reporter paradigm is frequently used in robotics.
            Usually, LLMs serve as the Planner due to their impressive planning and reasoning capabilities, 
            and humans or VLMs play the role of Reporter to provide necessary language feedback for the Planner's planning. 
            The Actor is the agent that interacts with the environment.
          </p>

          <p>
          As shown in the above figure, we use Llama 3 8B and the trained pick-and-place CLIPort primitive as the Planner and Actor, respectively.
          For the Reporter, we use the VLM CogVLM2.
          Theoretically, any type of feedback from the environment
          and the robot can be considered to inform the LLM planner
          as long as it can be stated verbally. However, considering the
          LoHoRavens simulated environment and the VLMs we use,
          we just prompt the VLMs to generate the following types of
          feedback:
          </p>

          <ul>
          <li><i>Observation state feedback</i>:
          Besides the human instruction at the beginning, the Planner needs to have the
          information about the objects on the table for the planning.
          Furthermore, if the states of the objects change, the VLM
          Reporter should describe the changes to the LLM Planner.</li>
          <li><i>Action success state feedback</i>:
          The robot Actor
          may fail to complete the instruction given by the LLM
          Planner. This kind of success state information (or rather
          failure information) should be conveyed to the Planner. The
          VLM Reporter will indicate in its description whether the
          last instruction is executed successfully or not. </li>
          </ul>
          
            
        </div>
        <!-- <br/>
            <b>Paradigm 1:</b> Unlike existing object detectors, CLIP is not limited to a predefined set of object classes. And unlike other vision-language models, it's not restricted by a top-down pipeline that detects objects with bounding boxes or instance segmentations. This allows us to forgo the traditional paradigm of training explicit detectors for cloths, pliers, chessboard squares, cherry stems, and other arbitrary things. 
        <br/> -->
        <br/>
        <br/>
        <!--/ Interpolating. -->

        <h3 class="title is-4">More baselines are being added ...</h3>
        <br/>
        <br/>


        <h2 class="title is-3">Results</h2>
        <img src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/images/exp_results_dpi799.png" class="interpolation-image"
         alt="Results of the IL and PAR baselines on seen and unseen tasks."/>
         <p>
        The above figure shows how the two baselines perform on all seen tasks and unseen tasks.
        Numbers are averages over all relevant tasks.
        We can see that the imitation learning based CLIPort model (IL) performs a little worse than the Planner-Actor-Reporter based model (PAR) on the seen tasks. 
        However, when generalizing to the unseen tasks, the IL model drops quite a lot while the PAR counterpart can have almost the same performance as on the seen tasks.
        The binary success rate of both models is quite low, indicating it's very hard for them to finish all the steps of the long-horizon tasks. 
        </p>
        <br/>
        <br/>
        <img src="https://shengqiang-zhang.github.io/lohoravens-webpage/media/images/capabilities_results_dpi799.png" class="interpolation-image"
         alt="Results of the IL and PAR baselines broken down by the reasoning capabilities the tasks require."/>
         <p>
         As we can see from the above figure, the overall tendency is that the models' performance drops as the number of combinations of reasoning increases.
         This observation fits with our intuition that 
         the more reasoning capabilities the tasks require,
         the harder the tasks become.
         But there are still some exceptions violating this rule.
         Unexpectedly, the IL baseline performs better on the tasks requiring "color+size" capabilities than on the tasks requiring 
         "color+size+commonsense" capabilities.
         We speculate the reason is that "color+size+commonsense" tasks typically use commonsense to filter the objects needed to manipulate, thus this kind of task usually requires fewer steps to complete.
        </p>
         
        <p>
         Another interesting finding is that the two baselines perform differently regarding different reasoning capabilities.
         On the seen tasks requiring spatial reasoning capability, 
         the IL model usually performs better.
         It is probably because current LLMs and VLMs do not have
         good spatial understanding.
         In contrast, the PAR model usually outperforms the IL model on tasks requiring commonsense.
         Another observation is that the PAR model cannot deal with tasks requiring reference since LLMs cannot indicate the objects accurately if there is more than one object with the same size and color.
         This also prevents the PAR model from solving the tasks
         requiring arithmetic reasoning since these tasks usually
         comprise multiple objects of the same kind.
         </p>
        <p>
         The experiments also show that some tasks are extremely hard for both models. 
         For tasks that contain hidden objects, 
         both models struggle to reason to remove the top object that blocks the bottom target objects.
         Moreover, they 
         are almost completely unable to solve shape construction tasks.
         To summarize, LoHoRavens is a good resource to benchmark methods for robotic manipulation. 
         It is also a challenging benchmark that can facilitate future work on developing more advanced models on long-horizon robotic tasks.
        </p>


      </div>
    </div>

    </div>
  </div>
</section>


<!-- <section class="section" id="BibTeX">
  <div class="container is-max-widescreen content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@inproceedings{shridhar2021cliport,
  title     = {CLIPort: What and Where Pathways for Robotic Manipulation},
  author    = {Shridhar, Mohit and Manuelli, Lucas and Fox, Dieter},
  booktitle = {Proceedings of the 5th Conference on Robot Learning (CoRL)},
  year      = {2021},
}</code></pre>
  </div>
</section> -->


<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column">
        <div class="content has-text-centered">
          <p>
            Website template borrowed from <a href="https://cliport.github.io/">CLIPort</a>. 
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

</body>
</html>