<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description"
content="EditRoom: LLM-parameterized Graph Diffusion for Composable 3D Room Layout Editing">
<meta name="keywords" content="3D Scene Editing, Scene Layout, Graph Diffusion, Large Language Models">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>EditRoom: LLM-parameterized Graph Diffusion for Composable 3D Room Layout Editing</title>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-PYVRSFMDRL');
</script>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="icon" href="./static/images/icon.png">
<link rel="stylesheet" href="./static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">EditRoom: LLM-parameterized Graph Diffusion for Composable 3D Room Layout Editing</h1>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="https://kzzheng.github.io/">Kaizhi Zheng</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://sites.google.com/umich.edu/xiaotong-chen/home">Xiaotong Chen</a><sup>3</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?user=kDzxOzUAAAAJ&hl=en">Xuehai He</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://g-jing.github.io">Jing Gu</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://www.microsoft.com/en-us/research/people/linjli/">Linjie Li</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://zyang-ur.github.io/">Zhengyuan Yang</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://sites.google.com/site/kevinlin311tw/me">Kevin Lin</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://jianfengwang.me/">Jianfeng Wang</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://www.microsoft.com/en-us/research/people/lijuanw/">Lijuan Wang</a><sup>2</sup>,</span>
<span class="author-block">
<a href="https://eric-xw.github.io/">Xin Eric Wang</a><sup>1</sup>,</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>University of California, Santa Cruz,</span>
<span class="author-block"><sup>2</sup>Microsoft,</span>
<span class="author-block"><sup>3</sup>University of Michigan, Ann Arbor</span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<span class="link-block">
<a href="https://arxiv.org/pdf/2410.12836"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<span class="link-block">
<a href="https://arxiv.org/abs/2410.12836"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Code Link. -->
<!-- <span class="link-block">
<a href="https://github.com/eric-ai-lab/MiniGPT-5"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span> -->
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero teaser">
<div class="container is-max-desktop">
<div class="hero-body">
<img id="teaser" width="150%" src="./static/images/new_teaser-1.png">
<h2 class="subtitle has-text-centered">
<p style="font-family:Times New Roman"><b>Figure 1. Editing pipeline with EditRoom. EditRoom is a unified language-guided 3D scene layout
editing framework that can automatically execute all layout editing types from natural language
commands. It consists of a command parameterizer for natural language comprehension and
a scene editor for edit execution. Given a source scene and natural language commands, it
generates a coherent and appropriate target scene.</b></p>
</h2>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Given the steep learning curve of professional 3D software and the time-consuming process of managing large 3D assets, language-guided 3D scene editing has significant potential in fields such as virtual reality, augmented reality, and
gaming. However, recent approaches to language-guided 3D scene editing either
require manual interventions or focus only on appearance modifications without
supporting comprehensive scene layout changes. In response, we propose <b>EditRoom</b>, a unified framework capable of executing a variety of layout edits through
natural language commands, without requiring manual intervention. Specifically,
<b>EditRoom</b> leverages Large Language Models (LLMs) for command planning and
generates target scenes using a diffusion-based method, enabling six types of edits: rotate, translate, scale, replace, add, and remove. To address
the lack of data for language-guided 3D scene editing, we have developed an automatic pipeline to augment existing 3D scene synthesis datasets and introduced
<b>EditRoom-DB</b>, a large-scale dataset with 83k editing pairs, for training and evaluation. Our experiments demonstrate that our approach consistently outperforms
other baselines across all metrics, indicating higher accuracy and coherence in
language-guided scene layout editing.
</p>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-five-fifths">
<h2 class="title is-3"><img id="painting_icon" width="5%" src="https://cdn-icons-png.flaticon.com/512/5379/5379860.png"> Unified Scene Layout Editing </h2>
</div>
</div>
<div class="columns is-centered has-text-centered">
<div class="column is-six-fifths">
<div class="content has-text-justified">
<ul>
<li>We leverage a pretrained multimodal large language model (GPT-4o) as the command parameterizer and a graph diffusion-based method as the scene editor to create a unified scene layout editing pipeline. </li>
<li>The large language model converts natural language commands into breakdown commands, given the source scene information. </li>
<li>The scene editor takes the breakdown commands and the source scene as input; it first generates an abstract target scene graph, then estimates the accurate target scene layout.</li>
</ul>
</div>
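<p>The pipeline above can be sketched in a few lines of Python. This is a minimal illustrative sketch only: the function names and the breakdown-command schema are hypothetical stand-ins, not the released EditRoom API.</p>

```python
# Hypothetical sketch of the EditRoom pipeline: an LLM-based command
# parameterizer followed by a scene editor. All names are illustrative.

def parameterize_command(command: str, source_scene: dict) -> list:
    """Stand-in for the GPT-4o command parameterizer: break a natural-language
    command into atomic breakdown commands grounded in the source scene.
    A real implementation would prompt the LLM with the scene description;
    here we return a fixed breakdown for illustration."""
    return [{"op": "translate", "target": "chair", "offset": [0.5, 0.0, 0.0]}]

def scene_editor(source_scene: dict, breakdown: list) -> dict:
    """Stand-in for the graph diffusion-based scene editor: in the paper this
    first generates a target scene graph, then estimates poses and sizes;
    here we simply apply the breakdown commands to the layout."""
    target = {name: dict(obj) for name, obj in source_scene.items()}
    for step in breakdown:
        obj = target[step["target"]]
        if step["op"] == "translate":
            obj["position"] = [p + d for p, d in zip(obj["position"], step["offset"])]
    return target

source = {"chair": {"position": [1.0, 0.0, 2.0], "size": [0.5, 1.0, 0.5]}}
edits = parameterize_command("move the chair slightly to the right", source)
target = scene_editor(source, edits)
print(target["chair"]["position"])  # [1.5, 0.0, 2.0]
```

<p>The source scene is copied before editing, mirroring the paper's setting where the target scene is generated rather than mutated in place.</p>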
<img id="model" width="100%" src="./static/images/editor_method-1.png">
<h3 class="subtitle has-text-centered">
<p style="font-family:Times New Roman"><b>Figure 2. The scene editor aims to provide accurate, coherent editing results according to the given source scene and language commands.
It consists of two graph transformer-based conditional diffusion models: one generates the semantic target scene graph,
and the other estimates accurate pose and size information for each object in the generated target scene graph.
Both diffusion processes are conditioned on the source scene and the breakdown command.</b></p>
</h3>
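<p>The conditional diffusion idea behind the scene editor can be illustrated with a toy reverse-sampling loop: a stand-in denoiser iteratively nudges a noisy layout vector toward a state consistent with the conditioning. Everything below is a simplified assumption for intuition, not the paper's actual models or training setup.</p>

```python
# Toy conditional "denoising" loop, loosely mirroring the reverse diffusion
# process described in Figure 2. The denoiser, step rule, and conditioning
# are simplified stand-ins, not the paper's graph transformer models.
import random

def toy_denoiser(x_t, t, cond):
    # Stand-in network: predicts a residual pulling the sample toward the
    # conditioning target (here, a desired layout vector).
    return [x - c for x, c in zip(x_t, cond)]

def reverse_diffusion(cond, steps=50, rate=0.1, seed=0):
    rng = random.Random(seed)
    x = [rng.gauss(0.0, 1.0) for _ in cond]  # start from pure noise
    for t in range(steps, 0, -1):
        residual = toy_denoiser(x, t, cond)
        x = [xi - rate * r for xi, r in zip(x, residual)]
    return x

# Condition on a target layout (e.g. position + size of one object); the
# sample converges toward a layout consistent with the conditioning.
layout = reverse_diffusion(cond=[1.5, 0.0, 2.0, 0.5])
```

<p>In the actual system, the conditioning carries the full source scene and the breakdown command, and a second diffusion model of the same shape operates over discrete scene-graph semantics rather than continuous poses.</p>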
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-five-fifths">
<h2 class="title is-3"><img id="painting_icon" width="5%" src="https://cdn-icons-png.flaticon.com/512/5379/5379860.png"> Qualitative Comparison</h2>
</div>
</div>
<div class="columns is-centered has-text-centered">
<div class="column is-six-fifths">
<div class="content has-text-justified">
<p>
Qualitative examples from <b>EditRoom</b> and baselines on single- and multi-operation editing. From the comparisons, we find that <b>EditRoom</b> provides more accurate and coherent editing results than the other baselines, and it generalizes to multi-operation editing tasks without being trained on such data.
</p>
</div>
<img id="model" width="100%" src="./static/images/single_6actions-1.png">
<h3 class="subtitle has-text-centered">
<p style="font-family:Times New Roman"><b>Figure 3. Comparison with other baselines on single-operation editing. </b></p>
</h3>
<img id="model" width="100%" src="./static/images/multi-1.png">
<h3 class="subtitle has-text-centered">
<p style="font-family:Times New Roman"><b>Figure 4. Comparison with other baselines on multi-operation editing. </b></p>
</h3>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@misc{zheng2024editroomllmparameterizedgraphdiffusion,
title={EditRoom: LLM-parameterized Graph Diffusion for Composable 3D Room Layout Editing},
author={Kaizhi Zheng and Xiaotong Chen and Xuehai He and Jing Gu and Linjie Li and Zhengyuan Yang and Kevin Lin and Jianfeng Wang and Lijuan Wang and Xin Eric Wang},
year={2024},
eprint={2410.12836},
archivePrefix={arXiv},
primaryClass={cs.GR},
url={https://arxiv.org/abs/2410.12836},
}
</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This website is adapted from <a rel="license"
href="https://github.com/nerfies/nerfies.github.io">Nerfies</a> and <a rel="license"
href="https://gligen.github.io/">GLIGEN</a>, and is licensed under a <a rel="license"
href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>