<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Search-engine and social-media preview metadata for the InvSeg project page. -->
  <meta name="description" content="InvSeg is a test-time prompt inversion method that uses per-image visual context to optimize class-name text prompts for open-vocabulary semantic segmentation with text-to-image diffusion models.">
  <meta property="og:title" content="InvSeg: Test-Time Prompt Inversion for Semantic Segmentation">
  <meta property="og:description" content="A test-time prompt inversion method that learns context-rich text prompts from per-image visual context for open-vocabulary semantic segmentation.">
  <!-- TODO: add an og:url meta tag holding the absolute URL of the deployed page.
       The template placeholder was removed because a non-URL value is invalid there. -->
  <!-- Banner image for link previews; the recommended size is 1200x630.
       NOTE(review): crawlers generally expect an absolute URL here; confirm once the page URL is known. -->
  <meta property="og:image" content="GenSAM_logo.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">

  <meta name="twitter:title" content="InvSeg: Test-Time Prompt Inversion for Semantic Segmentation">
  <meta name="twitter:description" content="A test-time prompt inversion method that learns context-rich text prompts from per-image visual context for open-vocabulary semantic segmentation.">
  <!-- Banner image for the Twitter card; the recommended size is 1200x600. -->
  <meta name="twitter:image" content="GenSAM_logo.png">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords the paper should be indexed by. -->
  <meta name="keywords" content="InvSeg, semantic segmentation, open-vocabulary segmentation, prompt inversion, test-time adaptation, diffusion models, Contrastive Soft Clustering">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <title>InvSeg: Test-Time Prompt Inversion for Semantic Segmentation</title>
  <!-- The icon file is a PNG, so the declared MIME type must be image/png. -->
  <link rel="icon" type="image/png" href="GenSAM_logo.png">
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro">

  <!-- Bulma, its carousel/slider extensions, icon fonts, then page-specific overrides. -->
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Script order is kept as in the template; index.js is loaded last. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>


  <!-- Title block: paper title, authors, affiliation and resource links. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">InvSeg: Test-Time Prompt Inversion for Semantic Segmentation</h1>

            <!-- Paper authors -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a href="https://jylin8100.github.io/" target="_blank" rel="noopener">Jiayi Lin</a>,</span>
              <span class="author-block">
                <a href="https://raymond-sci.github.io/" target="_blank" rel="noopener">Jiabo Huang</a>,</span>
              <span class="author-block">
                <a href="https://lwpyh.github.io/" target="_blank" rel="noopener">Jian Hu</a>,</span>
              <span class="author-block">
                <a href="http://www.eecs.qmul.ac.uk/~sgg/" target="_blank" rel="noopener">Shaogang Gong</a>
              </span>
            </div>

            <!-- Affiliation -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">Queen Mary University of London</span>
            </div>

            <!-- Resource links; icons are decorative, the visible text names each link. -->
            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- arXiv abstract link -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2410.11473" target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="ai ai-arxiv" aria-hidden="true"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>

                <!-- Supplementary PDF link -->
                <span class="link-block">
                  <a href="supplementary_material.pdf" target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf" aria-hidden="true"></i>
                    </span>
                    <span>Supplementary</span>
                  </a>
                </span>

                <!-- GitHub link -->
                <span class="link-block">
                  <a href="https://github.com/jyLin8100/InvSeg" target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github" aria-hidden="true"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <!-- NOTE: this selector is global; it matches every element with class "content" on the page, not only this abstract. -->
          <style>
            .content {
              font-family: "Times New Roman", Times, serif;
            }
          </style>
          <p>
            The precise visual-textual correlations embedded in the attention maps derived from text-to-image generative diffusion models have been shown beneficial to open-vocabulary dense visual prediction tasks, e.g., semantic segmentation.
            However, a significant challenge arises due to the distributional discrepancy between the context-rich sentences used for image generation training and the isolated class names typically available for visual discrimination. This discrepancy in the richness of textual context limits the effectiveness of diffusion models in capturing accurate visual-textual correlations.
            To tackle this challenge, we propose a novel approach called InvSeg, a test-time prompt inversion method that leverages per-image visual context to optimize the context-insufficient text prompts composed of isolated class names, so as to associate every pixel and class for open-vocabulary semantic segmentation. Specifically, we introduce a Contrastive Soft Clustering (CSC) method to derive the underlying structure of images
            according to the assumption that different objects usually occupy distinct while continuous areas within visual scenes. Such structural information is then used to constrain the image-text cross-attention for calibrating the input class embeddings without requiring any manual label or additional training data. By incorporating sample-specific context at test time, InvSeg learns context-rich text prompts in embedding space to achieve accurate semantic alignment across modalities. Experiments show that InvSeg achieves state-of-the-art performance on the PASCAL VOC and Context datasets.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->

  <!-- Framework overview figure (static image with caption) -->
  <section class="hero is-small">
    <div class="hero-body">
      <div class="container" style="display: flex; justify-content: center; align-items: center; flex-direction: column;">
        <h2 class="title is-2 has-text-centered">Our Framework</h2>
        <img src="fig2_v2.png" alt="Diagram of the InvSeg framework: text tokens initialized by the diffusion model's pretrained text encoder are optimized at test time to produce refined attention maps for segmentation." style="width: 1000px; height: auto; margin-top: 30px; display: block;">
        <!-- Caption. The attention-map symbol is written as plain text because the page loads no math renderer. -->
        <h2 class="subtitle" style="max-width: 75%; margin: 0 auto; text-align: justify;">
          <br>The framework of <strong>InvSeg</strong>. Our proposed Contrastive Soft Clustering method can achieve region-level prompt inversion. The text tokens are first initialized with the pretrained text encoder from the diffusion model (dashed box on left) and then are used as the only learnable parameters during test-time training. After the adaptation process, the learned text tokens can be used to derive more accurate and complete refined attention maps <i>{M}</i> for segmentation.
        </h2>
      </div>
    </div>
  </section>
  <!-- End framework overview figure -->

<!-- Comparison with state-of-the-art methods (static results image with caption) -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container" style="display: flex; justify-content: center; align-items: center; flex-direction: column;">
      <h2 class="title is-2 has-text-centered">Comparison with SOTA</h2>
      <img src="experiment_invseg.png" alt="Results table comparing InvSeg with existing segmentation methods." style="width: 1000px; height: auto; margin-top: 30px; display: block;">
      <h2 class="subtitle" style="max-width: 75%; margin: 0 auto; text-align: justify;">
        <br><strong>Comparison with existing methods</strong>. Models in the first three rows are finetuned on target datasets while the remaining approaches do not require mask annotations. <strong>Bold fonts</strong> refer to the best results among the models and <u>underline fonts</u> refer to the second best.
      </h2>
    </div>
  </div>
</section>
<!-- End comparison with state-of-the-art methods -->

  
<!-- Qualitative visualizations (two static images, each followed by its caption) -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container" style="display: flex; justify-content: center; align-items: center; flex-direction: column;">
      <h2 class="title is-2 has-text-centered">Visualization</h2>

      <!-- Segmentation examples -->
      <img src="example_vis.png" alt="Segmentation examples on VOC, Context and COCO; each group of four shows the input, ground truth, InvSeg result and diffusion baseline result." style="width: 1000px; height: auto; margin-top: 30px; display: block;">
      <h2 class="subtitle" style="max-width: 75%; margin: 0 auto; text-align: justify;">
        Examples of Segmentation on VOC (top), Context (middle) and COCO (bottom). For each sample (image group of four), from left to right is input, GT, InvSeg, Diffusion baseline.
      </h2>

      <!-- Cross-attention maps before and after prompt inversion -->
      <img src="attention.png" alt="Refined cross-attention maps derived from text prompts before (top) and after (bottom) prompt inversion." style="width: 1000px; height: auto; margin-top: 30px; display: block;">
      <h2 class="subtitle" style="max-width: 75%; margin: 0 auto; text-align: justify;">
        Visualization of refined cross-attention maps derived from text prompts before (top) and after (bottom) prompt inversion. Before prompt inversion, the segmentation of background elements such as "grass" or "trees" is influenced by foreground objects like "cow" or "horse", resulting in mistakenly ignoring background classes or segmenting foreground (and background) classes. After applying prompt inversion, this phenomenon is suppressed by improving the distinction between foreground and background through proposed Contrastive Soft Clustering.
      </h2>
    </div>
  </div>
</section>
<!-- End qualitative visualizations -->
  
  
<!-- Citation: BibTeX entry for the paper. The <pre> content is whitespace-sensitive and is kept verbatim. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@misc{lin2024invsegtesttimepromptinversion,
      title={InvSeg: Test-Time Prompt Inversion for Semantic Segmentation}, 
      author={Jiayi Lin and Jiabo Huang and Jian Hu and Shaogang Gong},
      year={2024},
      eprint={2410.11473},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2410.11473}, 
}</code></pre>
  </div>
</section>
<!-- End citation -->


  <!-- Page footer: template attribution and license notice. -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">
            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>
          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>