Commit 0d4526f

onnx
1 parent 759948f commit 0d4526f

3 files changed: +259 −19 lines changed


README.md

Lines changed: 11 additions & 1 deletion
@@ -8,7 +8,9 @@ We propose HQ-SAM to upgrade SAM for high-quality zero-shot segmentation. Refer
 
 Updates
 -----------------
-:fire::fire: Play with HQ-SAM demo at [![Huggingfaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/sam-hq-team/sam-hq), supported by [Huggingface Spaces](https://huggingface.co/spaces), which supports point, box and text prompts.
+:fire::fire: We release the [ONNX export script](#onnx-export) and [colab notebook](https://colab.research.google.com/drive/11U2La49c2IxahzJkAV-EzPqEH3cz_5hq?usp=sharing) for exporting and using the ONNX model.
+
+:fire: Play with HQ-SAM demo at [![Huggingfaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/sam-hq-team/sam-hq), supported by [Huggingface Spaces](https://huggingface.co/spaces), which supports point, box and text prompts.
 
 :fire: We released the [colab notebook demo](https://colab.research.google.com/drive/1QwAbn5hsdqKOD5niuBzuqQX4eLCbNKFL?usp=sharing) and [automatic mask generator notebook](https://colab.research.google.com/drive/1dhRq4eR6Fbl-yl1vbQvU9hqyyeOidQaU?usp=sharing).
 
@@ -133,6 +135,14 @@ python demo/demo_sam.py
 ```
 
 
+### **ONNX export**
+HQ-SAM's lightweight mask decoder can be exported to ONNX format so that it can be run in any environment that supports the ONNX runtime. Export the model with
+```
+python scripts/export_onnx_model.py --checkpoint <path/to/checkpoint> --model-type <model_type> --output <path/to/output>
+```
+See the [example notebook](https://colab.research.google.com/drive/11U2La49c2IxahzJkAV-EzPqEH3cz_5hq?usp=sharing) for details on how to combine image preprocessing via HQ-SAM's backbone with mask prediction using the ONNX model. It is recommended to use the latest stable version of PyTorch for ONNX export.
+
+
 Citation
 ---------------
 If you find HQ-SAM useful in your research or refer to the provided baseline results, please star :star: this repository and consider citing :pencil::
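For orientation, the exported decoder is an ordinary ONNX graph. The sketch below is a minimal illustration (not code from this commit) of running it with onnxruntime, using random tensors shaped like the export script's dummy inputs for a vit_l model. The filename `sam_hq_vit_l.onnx`, the 256-dim prompt-encoder embedding, and the 64×64 embedding grid are assumptions; in real use the embeddings come from HQ-SAM's image encoder, and click coordinates must first be mapped into the 1024-pixel resized frame (see the coordinate sketch after the export script).

```python
import numpy as np
import onnxruntime

# Assumed filename; use whatever --output was passed to the export script.
session = onnxruntime.InferenceSession("sam_hq_vit_l.onnx", providers=["CPUExecutionProvider"])

ort_inputs = {
    # (1, 256, 64, 64): image embedding from HQ-SAM's backbone (random here).
    "image_embeddings": np.random.randn(1, 256, 64, 64).astype(np.float32),
    # (4, 1, 64, 64, 1024): early ViT features; the last dim is 1024 for vit_l.
    "interm_embeddings": np.random.randn(4, 1, 64, 64, 1024).astype(np.float32),
    # One foreground click plus one padding point (label -1), coords in the 1024-resized frame.
    "point_coords": np.array([[[500.0, 375.0], [0.0, 0.0]]], dtype=np.float32),
    "point_labels": np.array([[1.0, -1.0]], dtype=np.float32),
    # No previous low-res mask is provided.
    "mask_input": np.zeros((1, 1, 256, 256), dtype=np.float32),
    "has_mask_input": np.zeros(1, dtype=np.float32),
    # (H, W) of the original image; masks are upscaled to this size.
    "orig_im_size": np.array([1500.0, 2250.0], dtype=np.float32),
}

# Exported without --return-extra-metrics, the model returns three outputs.
masks, iou_predictions, low_res_masks = session.run(None, ort_inputs)
print(masks.shape)  # (1, 1, 1500, 2250) mask logits; threshold at 0 for a binary mask
```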

scripts/export_onnx_model.py

Lines changed: 219 additions & 0 deletions
@@ -0,0 +1,219 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch

from segment_anything import sam_model_registry
from segment_anything.utils.onnx import SamOnnxModel

import argparse
import warnings

try:
    import onnxruntime  # type: ignore

    onnxruntime_exists = True
except ImportError:
    onnxruntime_exists = False

parser = argparse.ArgumentParser(
    description="Export the SAM prompt encoder and mask decoder to an ONNX model."
)

parser.add_argument(
    "--checkpoint", type=str, required=True, help="The path to the SAM model checkpoint."
)

parser.add_argument(
    "--output", type=str, required=True, help="The filename to save the ONNX model to."
)

parser.add_argument(
    "--model-type",
    type=str,
    required=True,
    help="In ['default', 'vit_h', 'vit_l', 'vit_b']. Which type of SAM model to export.",
)

parser.add_argument(
    "--hq-token-only",
    action="store_true",
    help=(
        "False means use the hq output to correct the SAM output; True means use the hq output only. Default: False. "
        "For the best visualization effect, we suggest setting hq_token_only=False for images containing multiple "
        "objects (like typical COCO images) and hq_token_only=True for images containing a single object. "
        "For quantitative evaluation on COCO/YTVOS/DAVIS/UVO/LVIS etc., we set hq_token_only=False."
    ),
)


parser.add_argument(
    "--multimask-output",
    action="store_true",
    help=(
        "If true, the exported ONNX model will use multi-mask output mode and "
        "select the best mask among the multi-mask outputs."
    ),
)

parser.add_argument(
    "--opset",
    type=int,
    default=17,
    help="The ONNX opset version to use. Must be >=11",
)

parser.add_argument(
    "--quantize-out",
    type=str,
    default=None,
    help=(
        "If set, will quantize the model and save it with this name. "
        "Quantization is performed with quantize_dynamic from onnxruntime.quantization.quantize."
    ),
)

parser.add_argument(
    "--gelu-approximate",
    action="store_true",
    help=(
        "Replace GELU operations with approximations using tanh. Useful "
        "for some runtimes that have slow or unimplemented erf ops, used in GELU."
    ),
)

parser.add_argument(
    "--use-stability-score",
    action="store_true",
    help=(
        "Replaces the model's predicted mask quality score with the stability "
        "score calculated on the low resolution masks using an offset of 1.0."
    ),
)

parser.add_argument(
    "--return-extra-metrics",
    action="store_true",
    help=(
        "The model will return five results: (masks, scores, stability_scores, "
        "areas, low_res_logits) instead of the usual three. This can be "
        "significantly slower for high resolution outputs."
    ),
)


def run_export(
    model_type: str,
    checkpoint: str,
    output: str,
    opset: int,
    hq_token_only: bool = False,
    multimask_output: bool = False,
    gelu_approximate: bool = False,
    use_stability_score: bool = False,
    return_extra_metrics=False,
):
    print("Loading model...")
    sam = sam_model_registry[model_type](checkpoint=checkpoint)

    onnx_model = SamOnnxModel(
        model=sam,
        hq_token_only=hq_token_only,
        multimask_output=multimask_output,
        use_stability_score=use_stability_score,
        return_extra_metrics=return_extra_metrics,
    )

    if gelu_approximate:
        for n, m in onnx_model.named_modules():
            if isinstance(m, torch.nn.GELU):
                m.approximate = "tanh"

    dynamic_axes = {
        "point_coords": {1: "num_points"},
        "point_labels": {1: "num_points"},
    }

    embed_dim = sam.prompt_encoder.embed_dim
    embed_size = sam.prompt_encoder.image_embedding_size
    encoder_embed_dim_dict = {"vit_b": 768, "vit_l": 1024, "vit_h": 1280}
    encoder_embed_dim = encoder_embed_dim_dict[model_type]

    mask_input_size = [4 * x for x in embed_size]
    dummy_inputs = {
        "image_embeddings": torch.randn(1, embed_dim, *embed_size, dtype=torch.float),
        "interm_embeddings": torch.randn(4, 1, *embed_size, encoder_embed_dim, dtype=torch.float),
        "point_coords": torch.randint(low=0, high=1024, size=(1, 5, 2), dtype=torch.float),
        "point_labels": torch.randint(low=0, high=4, size=(1, 5), dtype=torch.float),
        "mask_input": torch.randn(1, 1, *mask_input_size, dtype=torch.float),
        "has_mask_input": torch.tensor([1], dtype=torch.float),
        "orig_im_size": torch.tensor([1500, 2250], dtype=torch.float),
    }

    _ = onnx_model(**dummy_inputs)

    output_names = ["masks", "iou_predictions", "low_res_masks"]

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
        warnings.filterwarnings("ignore", category=UserWarning)
        with open(output, "wb") as f:
            print(f"Exporting onnx model to {output}...")
            torch.onnx.export(
                onnx_model,
                tuple(dummy_inputs.values()),
                f,
                export_params=True,
                verbose=False,
                opset_version=opset,
                do_constant_folding=True,
                input_names=list(dummy_inputs.keys()),
                output_names=output_names,
                dynamic_axes=dynamic_axes,
            )

    if onnxruntime_exists:
        ort_inputs = {k: to_numpy(v) for k, v in dummy_inputs.items()}
        # use the CPU execution provider by default
        providers = ["CPUExecutionProvider"]
        ort_session = onnxruntime.InferenceSession(output, providers=providers)
        _ = ort_session.run(None, ort_inputs)
        print("Model has successfully been run with ONNXRuntime.")


def to_numpy(tensor):
    return tensor.cpu().numpy()


if __name__ == "__main__":
    args = parser.parse_args()
    run_export(
        model_type=args.model_type,
        checkpoint=args.checkpoint,
        output=args.output,
        opset=args.opset,
        hq_token_only=args.hq_token_only,
        multimask_output=args.multimask_output,
        gelu_approximate=args.gelu_approximate,
        use_stability_score=args.use_stability_score,
        return_extra_metrics=args.return_extra_metrics,
    )

    if args.quantize_out is not None:
        assert onnxruntime_exists, "onnxruntime is required to quantize the model."
        from onnxruntime.quantization import QuantType  # type: ignore
        from onnxruntime.quantization.quantize import quantize_dynamic  # type: ignore

        print(f"Quantizing model and writing to {args.quantize_out}...")
        quantize_dynamic(
            model_input=args.output,
            model_output=args.quantize_out,
            optimize_model=True,
            per_channel=False,
            reduce_range=False,
            weight_type=QuantType.QUInt8,
        )
        print("Done!")
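Note that the script's dummy `point_coords` are sampled directly in `[0, 1024)`, i.e. in the coordinate frame of the resized model input rather than the original image. Below is a minimal sketch (not part of this commit) of mapping a real click into that frame before feeding it to the exported model, assuming this fork keeps SAM's `ResizeLongestSide` utility unchanged; the click location and image size are placeholder values.

```python
import numpy as np
from segment_anything.utils.transforms import ResizeLongestSide

transform = ResizeLongestSide(1024)   # SAM resizes the longest image side to 1024 pixels
original_size = (1500, 2250)          # (H, W) of the original image, matching orig_im_size above

# One foreground click in original-image pixels plus one padding point (label -1),
# following the convention of the original SAM ONNX example.
point_coords = np.array([[[900.0, 600.0], [0.0, 0.0]]], dtype=np.float32)
point_labels = np.array([[1.0, -1.0]], dtype=np.float32)

# Rescale the coordinates into the 1024-resized frame expected by the exported model.
onnx_coords = transform.apply_coords(point_coords, original_size).astype(np.float32)
```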

segment_anything/utils/onnx.py

Lines changed: 29 additions & 18 deletions
@@ -25,15 +25,17 @@ class SamOnnxModel(nn.Module):
     def __init__(
         self,
         model: Sam,
-        return_single_mask: bool,
+        hq_token_only: bool = False,
+        multimask_output: bool = False,
         use_stability_score: bool = False,
         return_extra_metrics: bool = False,
     ) -> None:
         super().__init__()
         self.mask_decoder = model.mask_decoder
         self.model = model
         self.img_size = model.image_encoder.img_size
-        self.return_single_mask = return_single_mask
+        self.hq_token_only = hq_token_only
+        self.multimask_output = multimask_output
         self.use_stability_score = use_stability_score
         self.stability_score_offset = 1.0
         self.return_extra_metrics = return_extra_metrics
@@ -89,25 +91,12 @@ def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -
         masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False)
         return masks
 
-    def select_masks(
-        self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # Determine if we should return the multiclick mask or not from the number of points.
-        # The reweighting is used to avoid control flow.
-        score_reweight = torch.tensor(
-            [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)]
-        ).to(iou_preds.device)
-        score = iou_preds + (num_points - 2.5) * score_reweight
-        best_idx = torch.argmax(score, dim=1)
-        masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1)
-        iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1)
-
-        return masks, iou_preds
 
     @torch.no_grad()
     def forward(
         self,
         image_embeddings: torch.Tensor,
+        interm_embeddings: torch.Tensor,
         point_coords: torch.Tensor,
         point_labels: torch.Tensor,
         mask_input: torch.Tensor,
@@ -117,20 +106,42 @@ def forward(
         sparse_embedding = self._embed_points(point_coords, point_labels)
         dense_embedding = self._embed_masks(mask_input, has_mask_input)
 
+        vit_features = interm_embeddings[0].permute(0, 3, 1, 2)  # early-layer ViT feature, after 1st global attention block in ViT
+        hq_features = self.model.mask_decoder.embedding_encoder(image_embeddings) + self.model.mask_decoder.compress_vit_feat(vit_features)
+
         masks, scores = self.model.mask_decoder.predict_masks(
             image_embeddings=image_embeddings,
             image_pe=self.model.prompt_encoder.get_dense_pe(),
             sparse_prompt_embeddings=sparse_embedding,
             dense_prompt_embeddings=dense_embedding,
+            hq_features=hq_features,
         )
 
         if self.use_stability_score:
             scores = calculate_stability_score(
                 masks, self.model.mask_threshold, self.stability_score_offset
             )
 
-        if self.return_single_mask:
-            masks, scores = self.select_masks(masks, scores, point_coords.shape[1])
+        if self.multimask_output:
+            # mask with the highest score
+            mask_slice = slice(1, self.model.mask_decoder.num_mask_tokens - 1)
+            scores = scores[:, mask_slice]
+            scores, max_iou_idx = torch.max(scores, dim=1)
+            scores = scores.unsqueeze(1)
+            masks_multi = masks[:, mask_slice, :, :]
+            masks_sam = masks_multi[torch.arange(masks_multi.size(0)), max_iou_idx].unsqueeze(1)
+        else:
+            # single mask output, default
+            mask_slice = slice(0, 1)
+            scores = scores[:, mask_slice]
+            masks_sam = masks[:, mask_slice]
+
+        masks_hq = masks[:, slice(self.model.mask_decoder.num_mask_tokens - 1, self.model.mask_decoder.num_mask_tokens)]
+
+        if self.hq_token_only:
+            masks = masks_hq
+        else:
+            masks = masks_sam + masks_hq
 
         upscaled_masks = self.mask_postprocessing(masks, orig_im_size)
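
The slicing in the new `forward` relies on the layout of the decoder's mask tokens: SAM's single-mask token at index 0, its multi-mask tokens from index 1 to `num_mask_tokens - 2`, and the HQ token appended last. Below is a small sketch of the selection with dummy tensors, assuming `num_mask_tokens == 5` (4 SAM tokens plus one HQ token); the shapes are placeholders for the low-resolution mask logits.

```python
import torch

num_mask_tokens = 5                                   # assumption: 4 SAM mask tokens + 1 HQ token
masks = torch.randn(1, num_mask_tokens, 256, 256)     # low-res mask logits, one map per token
scores = torch.rand(1, num_mask_tokens)               # predicted mask quality per token

# multimask_output=True: keep only SAM's multi-mask tokens (indices 1..3) and pick the best one.
mask_slice = slice(1, num_mask_tokens - 1)
best_scores, best_idx = torch.max(scores[:, mask_slice], dim=1)
masks_sam = masks[:, mask_slice][torch.arange(masks.size(0)), best_idx].unsqueeze(1)

# The HQ token is always the last one.
masks_hq = masks[:, num_mask_tokens - 1 : num_mask_tokens]

# hq_token_only=False (default): the HQ output corrects the selected SAM output.
combined = masks_sam + masks_hq
print(combined.shape)  # torch.Size([1, 1, 256, 256])
```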
