From a6d5795d10e8feb558939629154159bd358c251a Mon Sep 17 00:00:00 2001 From: Alexander Barabanov <97449232+AlexanderBarabanov@users.noreply.github.com> Date: Thu, 16 Jan 2025 12:43:09 +0000 Subject: [PATCH 1/9] Security scan pipeline update (#4177) * codeql settings update * codeql fix * update trivy * update bandit * trivy fix * trivy fix * trivy fix * json output * trivy fix * trivy spdx * codeql added * bandit update * remove bandit B320 * remove bandit B410 * remove workflow_dispatch * revert trivy yaml * fix format --- .ci/ipas_default.config | 6 +- .github/workflows/code_scan.yaml | 134 +++++++++++++++++++++++++++---- .github/workflows/codeql.yaml | 37 +++++---- tox.ini | 2 +- 4 files changed, 143 insertions(+), 36 deletions(-) diff --git a/.ci/ipas_default.config b/.ci/ipas_default.config index 4bc8b481e39..95dd511397c 100644 --- a/.ci/ipas_default.config +++ b/.ci/ipas_default.config @@ -40,7 +40,7 @@ # B317 : xml_bad_sax # B318 : xml_bad_minidom # B319 : xml_bad_pulldom -# B320 : xml_bad_etree +# B320 : xml_bad_etree - removed https://github.com/PyCQA/bandit/commit/e4da0b351f89a82b5de8dd791cbdd963476b5a11 # B321 : ftplib # B323 : unverified_context # B324 : hashlib_new_insecure_functions @@ -53,7 +53,7 @@ # B407 : import_xml_expat # B408 : import_xml_minidom # B409 : import_xml_pulldom -# B410 : import_lxml +# B410 : import_lxml - removed https://github.com/PyCQA/bandit/commit/e4da0b351f89a82b5de8dd791cbdd963476b5a11 # B411 : import_xmlrpclib # B412 : import_httpoxy # B413 : import_pycrypto @@ -83,7 +83,7 @@ # IPAS Required Checkers. Do not disable these # Additional checkers may be added if desired tests: - [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413'] + [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B411', 'B412', 'B413'] # (optional) list skipped test IDs here, eg '[B101, B406]': # The following checkers are not required but be added to tests list if desired diff --git a/.github/workflows/code_scan.yaml b/.github/workflows/code_scan.yaml index ad66b1d55a2..85d0d8abb1d 100644 --- a/.github/workflows/code_scan.yaml +++ b/.github/workflows/code_scan.yaml @@ -10,12 +10,11 @@ on: # every UTC 6PM from Mon to Fri - cron: "0 18 * * 1-5" -# Declare default permissions as read only. 
-permissions: read-all +permissions: {} jobs: - Trivy-scan: - runs-on: ubuntu-latest + Trivy: + runs-on: ubuntu-22.04 steps: - name: Checkout code uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -27,27 +26,52 @@ jobs: run: python -m pip install --require-hashes --no-deps -r .ci/requirements.txt - name: Freeze dependencies run: pip-compile --extra=docs,base,mmlab,anomaly -o requirements.txt pyproject.toml - - name: Trivy Scanning (spdx.json) - uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # 0.29.0 + + - name: Run Trivy Scan (vuln) + uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # v0.29.0 with: - trivy-config: ".ci/trivy-json.yaml" - scan-type: "fs" + scan-type: fs + scan-ref: requirements.txt + scanners: vuln + output: trivy-results-vuln.txt + + - name: Run Trivy Scan (dockerfile and secrets) + uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # v0.29.0 + with: + scan-type: fs scan-ref: . - - name: Trivy Scanning + scanners: misconfig,secret + output: trivy-results-misconfig.txt + skip-setup-trivy: true + + - name: Trivy Scanning (spdx) uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # 0.29.0 with: - trivy-config: ".ci/trivy.yaml" - scan-type: "fs" + scan-type: fs scan-ref: . + format: spdx-json + output: trivy-results-spdx.json + skip-setup-trivy: true + - name: Upload Trivy results artifact uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: trivy-results - path: "${{ github.workspace }}/trivy-results.*" + path: "${{ github.workspace }}/trivy-results-*" + retention-days: 7 # Use always() to always run this step to publish scan results when there are test failures if: ${{ always() }} + + - name: Upload deps list + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + if: always() + with: + name: python-deps-list + path: "${{ github.workspace }}/requirements.txt" + retention-days: 7 + Bandit: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -66,7 +90,87 @@ jobs: - name: Upload Bandit artifact uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: bandit-report - path: .tox/bandit-report.txt + name: bandit-results + path: .tox/bandit-results.txt + retention-days: 7 # Use always() to always run this step to publish scan results when there are test failures if: ${{ always() }} + + CodeQL: + name: Analyze (${{ matrix.language }}) + runs-on: ubuntu-22.04 + permissions: + # required for all workflows + security-events: write + + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + - language: actions # to scan workflows + build-mode: none + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + # Initializes the CodeQL tools for scanning. 
+ - name: Initialize CodeQL + uses: github/codeql-action/init@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + with: + category: "/language:${{matrix.language}}" + + - name: Generate CodeQL Report + uses: rsdmike/github-security-report-action@a149b24539044c92786ec39af8ba38c93496495d # v3.0.4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + template: report + outputDir: codeql-${{ matrix.language }} + + - name: Rename Report + shell: bash + continue-on-error: true + run: | + cd codeql-${{ matrix.language }} + mv "report.pdf" "codeql-${{ matrix.language }}.pdf" + + - name: Upload Report + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + with: + name: codeql-${{ matrix.language }}-results + path: codeql-${{ matrix.language }}/*.pdf + retention-days: 7 + + Summarize: + needs: [Trivy, Bandit, CodeQL] + if: always() + runs-on: ubuntu-22.04 + steps: + # Create directory first + - name: Create results directory + run: mkdir -p all-results + + # Download artifacts with error handling + - name: Download all results + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + continue-on-error: true # Don't fail if some tools didn't generate results + with: + pattern: "*-results" + merge-multiple: true + path: all-results + + # Only upload if there are files + - name: Upload combined results + if: hashFiles('all-results/**/*') != '' + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + with: + name: security-scan-results + path: all-results + retention-days: 7 diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml index 27a904f9445..49e78c1ac6d 100644 --- a/.github/workflows/codeql.yaml +++ b/.github/workflows/codeql.yaml @@ -12,20 +12,11 @@ name: "CodeQL" on: - push: - branches: - - develop - - releases/** pull_request: types: - opened - reopened - synchronize - schedule: - - cron: "0 0 * * 0" - -permissions: - contents: read jobs: analyze: @@ -35,20 +26,20 @@ jobs: # - https://gh.io/supported-runners-and-hardware-resources # - https://gh.io/using-larger-runners # Consider using larger runners for possible analysis time improvements. - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 timeout-minutes: 60 permissions: # required for all workflows security-events: write - # only required for workflows in private repositories - actions: read - contents: read - strategy: fail-fast: false matrix: - language: ["python"] + include: + - language: python + build-mode: none + - language: actions # to scan workflows + build-mode: none # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ] # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both @@ -63,6 +54,7 @@ jobs: uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 with: languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. 
@@ -74,13 +66,24 @@ jobs: uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 with: category: "/language:${{matrix.language}}" + - name: Generate Security Report uses: rsdmike/github-security-report-action@a149b24539044c92786ec39af8ba38c93496495d # v3.0.4 with: template: report token: ${{ secrets.GITHUB_TOKEN }} + outputDir: codeql-${{ matrix.language }} + + - name: Rename Report + shell: bash + continue-on-error: true + run: | + cd codeql-${{ matrix.language }} + mv "report.pdf" "codeql-${{ matrix.language }}.pdf" + - name: GitHub Upload Release Artifacts uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: codeql-report - path: "./report.pdf" + name: codeql-${{ matrix.language }}-results + path: codeql-${{ matrix.language }}/*.pdf + retention-days: 7 diff --git a/tox.ini b/tox.ini index 7aa0fa1ad5b..a4d8d7ac0db 100644 --- a/tox.ini +++ b/tox.ini @@ -99,7 +99,7 @@ deps = allowlist_externals = bandit commands = - - bandit -r -c .ci/ipas_default.config {toxinidir}/ -f txt -o {toxworkdir}/bandit-report.txt + - bandit -r -c .ci/ipas_default.config -f txt -o {toxworkdir}/bandit-results.txt . [testenv:fuzzing] From d663fd724f3c4ceacb186a5b8a7cb55136c6aacb Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Fri, 17 Jan 2025 13:26:08 +0000 Subject: [PATCH 2/9] OTX D-Fine Detection Algorithm Integration (#4142) * init * remove convertbox * Refactor D-FINE detector: remove unused components and update model configuration * update * update * Update * update recipes * Add d-fine-m * Fix recipes * dfine-l * Add dfine m - no aug * format changes * learnable params + disable teacher distillation * update * add recipes * update * update * update recipes * add dfine_hgnetv2_x * Update recipes * add tile DFine recipes * update recipes and tile batch size * update * update LR * DFine revert LR changes * make multi-scale optional * update tile recipes * update tiling recipes * add backbone pretrained weights * updawte * update * loss * update * Update * refactor d-fine criterion * * Fix docstring punctuation and remove unused aux_loss parameter in DFINETransformerModule * Refactor DFineCriterion * Update style changes * conv batchnorm fuse * update hybrid encoder * Refactor DFINE HybridEncoderModule to improve code clarity and remove redundant parameters * minor update * Refactor D-FINE module structure by removing obsolete detector file and reorganizing imports * Refactor import paths in D-FINE module and clean up unused code * Refactor D-FINE module by removing commented code, cleaning up imports, and updating documentation * Refactor D-FINE module by updating type hints, improving error messages, and enhancing documentation for RandomIoUCrop * Refactor D-FINE module by improving the weighting function's return structure and updating type hints in DFINECriterion * Update d-fine unit test * Refactor D-FINE module by enhancing docstrings for clarity and updating parameter names for consistency * Add D-Fine Detection Algorithm entries to CHANGELOG and object detection documentation * Fix device assignment for positional embeddings in HybridEncoderModule * Refactor D-FINE module by removing unused functions and integrating dfine_bbox2distance in DFINECriterion * Update codeowners * Add advanced parameters to optimization config in DFine model * Remove DFINE M, S, N model configuration files * disable tiling mem cache * Update codeowners * revert codeowner changes * Remove unused DFINE model configurations from unit tests * Add heavy unit 
test workflow and mark tests accordingly * Add container configuration for Heavy-Unit-Test job in pre_merge.yaml * Add additional transformations to D-Fine configuration and update test skips for unsupported models * Reduce batch size and remove heavy markers from unit tests in test_tiling.py * Revert "Add additional transformations to D-Fine configuration and update test skips for unsupported models" This reverts commit d5c66f54c0eecf3f1b038c0c347eadddb7021c1f. * Revert "Reduce batch size and remove heavy markers from unit tests in test_tiling.py" This reverts commit 563e0331a99b3792c0eca6fa28f15ffd07e394d8. * Add additional transformations to D-Fine configuration in YAML files * disable pytest heavy tag * update * Remove unused DFine-L model configurations and update unit tests * Add DFine-X model template for class-incremental object detection * Update docs/source/guide/explanation/algorithms/object_detection/object_detection.rst Co-authored-by: Samet Akcay * Update copyright years from 2024 to 2025 in multiple files * Rename heavy unit tests to intense unit tests and update related configurations * Update container image in pre_merge.yaml for Intense-Unit-Test job * update pre-merge * update ubuntu container image * update container image * Add new object detection model configuration for DFine HGNetV2 X * update image * Update pre-merge workflow to use Ubuntu 24.04 and simplify unit test coverage reporting * install sqlite * Remove sudo from apt-get command in pre-merge workflow * Remove sudo from apt-get command in pre-merge workflow * Update pre-merge workflow to install additional dependencies and correct model name in converter * Update detection configuration: increase warmup steps and patience, add min_lr, and remove unused callbacks * Remove D-Fine model recipes from object detection documentation * Skip tests for unsupported models: add check for D-Fine * Skip tests for unsupported models: add check for D-Fine * Skip tests for unsupported models: add check for DFine * Refactor DFine model: remove unused checkpoint loading and update optimizer configuration documentation; change reg_scale to float in DFINETransformer. 
--------- Co-authored-by: Samet Akcay --- .github/workflows/pre_merge.yaml | 32 + CHANGELOG.md | 2 + .../object_detection/object_detection.rst | 2 + pyproject.toml | 3 +- .../algo/common/layers/transformer_layers.py | 148 ++- src/otx/algo/detection/backbones/hgnetv2.py | 640 ++++++++++++ src/otx/algo/detection/d_fine.py | 308 ++++++ src/otx/algo/detection/heads/__init__.py | 13 +- src/otx/algo/detection/heads/dfine_decoder.py | 935 ++++++++++++++++++ src/otx/algo/detection/layers/csp_layer.py | 10 +- src/otx/algo/detection/losses/__init__.py | 4 +- src/otx/algo/detection/losses/dfine_loss.py | 501 ++++++++++ .../detection/necks/dfine_hybrid_encoder.py | 438 ++++++++ src/otx/algo/detection/utils/utils.py | 165 +++- .../core/data/transform_libs/torchvision.py | 43 +- src/otx/recipe/detection/dfine_x.yaml | 129 +++ src/otx/recipe/detection/dfine_x_tile.yaml | 125 +++ src/otx/tools/converter.py | 4 + .../detection/detection/dfine_x/template.yaml | 46 + tests/integration/api/test_xai.py | 10 + tests/integration/cli/test_cli.py | 6 + tests/unit/algo/detection/test_dfine.py | 158 +++ tests/unit/core/data/test_tiling.py | 17 + tox.ini | 8 +- 24 files changed, 3736 insertions(+), 11 deletions(-) create mode 100644 src/otx/algo/detection/backbones/hgnetv2.py create mode 100644 src/otx/algo/detection/d_fine.py create mode 100644 src/otx/algo/detection/heads/dfine_decoder.py create mode 100644 src/otx/algo/detection/losses/dfine_loss.py create mode 100644 src/otx/algo/detection/necks/dfine_hybrid_encoder.py create mode 100644 src/otx/recipe/detection/dfine_x.yaml create mode 100644 src/otx/recipe/detection/dfine_x_tile.yaml create mode 100644 src/otx/tools/templates/detection/detection/dfine_x/template.yaml create mode 100644 tests/unit/algo/detection/test_dfine.py diff --git a/.github/workflows/pre_merge.yaml b/.github/workflows/pre_merge.yaml index 201aaf089e3..07e1f67b44f 100644 --- a/.github/workflows/pre_merge.yaml +++ b/.github/workflows/pre_merge.yaml @@ -84,6 +84,38 @@ jobs: curl -Os https://uploader.codecov.io/latest/linux/codecov chmod +x codecov ./codecov -t ${{ secrets.CODECOV_TOKEN }} --sha $COMMIT_ID -U $HTTP_PROXY -f .tox/coverage_unit-test-${{ matrix.tox-env }}.xml -F ${{ matrix.tox-env }} + Intense-Unit-Test: + runs-on: [otx-gpu-a10g-1] + container: + image: "ubuntu:24.04" + needs: Code-Quality-Checks + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + include: + - python-version: "3.10" + tox-env: "py310" + - python-version: "3.11" + tox-env: "py311" + name: Intense-Unit-Test-with-Python${{ matrix.python-version }} + steps: + - name: Install dependencies + run: apt-get update && apt-get install -y libsqlite3-0 libsqlite3-dev libgl1 libglib2.0-0 + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Install Python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install tox + run: | + python -m pip install --require-hashes --no-deps -r .ci/requirements.txt + pip-compile --generate-hashes --output-file=/tmp/requirements.txt --extra=ci_tox pyproject.toml + python -m pip install --require-hashes --no-deps -r /tmp/requirements.txt + rm /tmp/requirements.txt + - name: Run unit test + run: tox -vv -e intense-unit-test-${{ matrix.tox-env }} Integration-Test: if: | github.event.pull_request.draft == false && diff --git a/CHANGELOG.md b/CHANGELOG.md index bb157675bba..1a5dbbb26b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ 
-22,6 +22,8 @@ All notable changes to this project will be documented in this file. () - Add OpenVINO inference for 3D Object Detection task () +- Add D-Fine Detection Algorithm + () ### Enhancements diff --git a/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst b/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst index 3dd3fbc0349..925e4f119f1 100644 --- a/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst +++ b/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst @@ -73,6 +73,8 @@ We support the following ready-to-use model recipes: +------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ | `Object_Detection_ResNeXt101_ATSS `_ | ResNeXt101-ATSS | 434.75 | 344.0 | +------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| `D-Fine X Detection ` | D-Fine X | 202.486 | 240.0 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ Above table can be found using the following command diff --git a/pyproject.toml b/pyproject.toml index 56c71ef9ec5..fa4b942ff1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -398,6 +398,7 @@ convention = "google" markers = [ "gpu", # mark tests which require NVIDIA GPU "cpu", - "xpu", # mark tests which require Intel dGPU + "xpu", # mark tests which require Intel dGPU, + "intense", # intense unit tests which require better CI machines ] python_files = "tests/**/*.py" diff --git a/src/otx/algo/common/layers/transformer_layers.py b/src/otx/algo/common/layers/transformer_layers.py index 20ae281ecad..532f314128c 100644 --- a/src/otx/algo/common/layers/transformer_layers.py +++ b/src/otx/algo/common/layers/transformer_layers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """Implementation of common transformer layers.""" @@ -10,6 +10,7 @@ from typing import Callable import torch +import torch.nn.functional as f from otx.algo.common.utils.utils import get_clones from otx.algo.modules.transformer import deformable_attention_core_func from torch import Tensor, nn @@ -306,6 +307,151 @@ def forward( return self.output_proj(output) +class MSDeformableAttentionV2(nn.Module): + """Multi-Scale Deformable Attention Module V2. + + Note: + This is different from vanilla MSDeformableAttention where it uses + distinct number of sampling points for features at different scales. + Refer to RTDETRv2. + + Args: + embed_dim (int): The number of expected features in the input. + num_heads (int): The number of heads in the multiheadattention models. + num_levels (int): The number of levels in MSDeformableAttention. + num_points_list (list[int]): Number of distinct points for each layer. Defaults to [3, 6, 3]. 
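+
+    Example:
+        A shape-only usage sketch; the batch size, query length and feature
+        map sizes below are illustrative only::
+
+            import torch
+
+            attn = MSDeformableAttentionV2(num_levels=3, num_points_list=[3, 6, 3])
+            spatial_shapes = [[32, 32], [16, 16], [8, 8]]
+            # one tensor per level, already split per head: [bs, n_head, head_dim, H*W]
+            value = [torch.rand(2, 8, 32, h * w) for h, w in spatial_shapes]
+            query = torch.rand(2, 100, 256)
+            # (cx, cy, w, h) reference boxes, broadcast over levels
+            reference_points = torch.rand(2, 100, 1, 4)
+            output = attn(query, reference_points, value, spatial_shapes)  # [2, 100, 256]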
+ """ + + def __init__( + self, + embed_dim: int = 256, + num_heads: int = 8, + num_levels: int = 4, + num_points_list: list[int] = [3, 6, 3], # noqa: B006 + ) -> None: + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points_list = num_points_list + + num_points_scale = [1 / n for n in num_points_list for _ in range(n)] + self.register_buffer( + "num_points_scale", + torch.tensor(num_points_scale, dtype=torch.float32), + ) + + self.total_points = num_heads * sum(num_points_list) + self.head_dim = embed_dim // num_heads + + self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2) + self.attention_weights = nn.Linear(embed_dim, self.total_points) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + """Reset parameters of the model.""" + init.constant_(self.sampling_offsets.weight, 0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values # noqa: PD011 + grid_init = grid_init.reshape(self.num_heads, 1, 2).tile([1, sum(self.num_points_list), 1]) + scaling = torch.concat([torch.arange(1, n + 1) for n in self.num_points_list]).reshape(1, -1, 1) + grid_init *= scaling + self.sampling_offsets.bias.data[...] = grid_init.flatten() + + # attention_weights + init.constant_(self.attention_weights.weight, 0) + init.constant_(self.attention_weights.bias, 0) + + def forward( + self, + query: Tensor, + reference_points: Tensor, + value: Tensor, + value_spatial_shapes: list[list[int]], + ) -> Tensor: + """Forward function of MSDeformableAttention. + + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, len_q = query.shape[:2] + _, n_head, c, _ = value[0].shape + num_points_list = self.num_points_list + + sampling_offsets = self.sampling_offsets(query).reshape( + bs, + len_q, + self.num_heads, + sum(self.num_points_list), + 2, + ) + + attention_weights = self.attention_weights(query).reshape( + bs, + len_q, + self.num_heads, + sum(self.num_points_list), + ) + attention_weights = f.softmax(attention_weights, dim=-1) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.tensor(value_spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.num_levels, 1, 2) + sampling_locations = ( + reference_points.reshape( + bs, + len_q, + 1, + self.num_levels, + 1, + 2, + ) + + sampling_offsets / offset_normalizer + ) + elif reference_points.shape[-1] == 4: + num_points_scale = self.num_points_scale.to(query).unsqueeze(-1) + offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * 0.5 + sampling_locations = reference_points[:, :, None, :, :2] + offset + else: + msg = (f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead.",) + raise ValueError(msg) + + # sampling_offsets [8, 480, 8, 12, 2] + sampling_grids = 2 * sampling_locations - 1 + + sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1) + sampling_locations_list = sampling_grids.split(num_points_list, dim=-2) + + sampling_value_list = [] + for 
level, (h, w) in enumerate(value_spatial_shapes): + value_l = value[level].reshape(bs * n_head, c, h, w) + sampling_grid_l = sampling_locations_list[level] + sampling_value_l = f.grid_sample( + value_l, + sampling_grid_l, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + + sampling_value_list.append(sampling_value_l) + + attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(bs * n_head, 1, len_q, sum(num_points_list)) + weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights + output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, len_q) + + return output.permute(0, 2, 1) + + class VisualEncoderLayer(nn.Module): """VisualEncoderLayer module consisting of MSDeformableAttention and feed-forward network. diff --git a/src/otx/algo/detection/backbones/hgnetv2.py b/src/otx/algo/detection/backbones/hgnetv2.py new file mode 100644 index 00000000000..65fd8408d7f --- /dev/null +++ b/src/otx/algo/detection/backbones/hgnetv2.py @@ -0,0 +1,640 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""High Performance GPU Net(HGNet) Backbone from PaddlePaddle. + +Modified from: + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + https://github.com/Peterande/D-FINE +""" + +from __future__ import annotations + +from typing import Any, ClassVar + +import torch +import torch.nn.functional as f +from torch import Tensor, nn + +from otx.algo.modules.norm import FrozenBatchNorm2d + +# Constants for initialization +kaiming_normal_ = nn.init.kaiming_normal_ +zeros_ = nn.init.zeros_ +ones_ = nn.init.ones_ + + +class LearnableAffineBlock(nn.Module): + """Learnable affine block. + + Args: + scale_value (float, optional): scale. Defaults to 1.0. + bias_value (float, optional): bias. Defaults to 0.0. + """ + + def __init__( + self, + scale_value: float = 1.0, + bias_value: float = 0.0, + ): + super().__init__() + self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True) + self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + return self.scale * x + self.bias + + +class ConvBNAct(nn.Module): + """Convolutional block with batch normalization and activation. + + TODO(Eugene): External LAB is embedded. 'Try'? switching to OTX ConvModule implementation in next PR. + + Args: + in_channels (int): In channels. + out_channels (int): Out Channels. + kernel_size (int): convolution kernel size. + stride (int, optional): stride. Defaults to 1. + groups (int, optional): number of conv groups. Defaults to 1. + use_act (bool, optional): Use ReLU activation. Defaults to True. + use_lab (bool, optional): Use learnable affine block. Defaults to False. 
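+
+    Example:
+        A minimal sketch; the channel sizes and input resolution are
+        illustrative only::
+
+            import torch
+
+            block = ConvBNAct(in_channels=3, out_channels=16, kernel_size=3, stride=2, use_lab=True)
+            out = block(torch.rand(1, 3, 64, 64))  # [1, 16, 32, 32]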
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + groups: int = 1, + use_act: bool = True, + use_lab: bool = False, + ): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False, + ) + self.bn = nn.BatchNorm2d(out_channels) + if self.use_act: + self.act = nn.ReLU() + else: + self.act = nn.Identity() + if self.use_act and self.use_lab: + self.lab = LearnableAffineBlock() + else: + self.lab = nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return self.lab(x) + + +class LightConvBNAct(nn.Module): + """Lightweight convolutional block with batch normalization and activation. + + Args: + in_chs (int): In channels. + out_chs (int): Out channels. + kernel_size (int): convolution kernel size. + use_lab (bool, optional): Use Learnable Affine Block. Defaults to False. + """ + + def __init__( + self, + in_chs: int, + out_chs: int, + kernel_size: int, + use_lab: bool = False, + ): + super().__init__() + self.conv1 = ConvBNAct( + in_chs, + out_chs, + kernel_size=1, + use_act=False, + use_lab=use_lab, + ) + self.conv2 = ConvBNAct( + out_chs, + out_chs, + kernel_size=kernel_size, + groups=out_chs, + use_act=True, + use_lab=use_lab, + ) + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + x = self.conv1(x) + return self.conv2(x) + + +class HGNetv2StemBlock(nn.Module): + """HGNetV2 stem block. + + Args: + in_chs (int): In channels. + mid_chs (int): Mid channels. + out_chs (int): Out channels. + use_lab (bool, optional): Use Learnable Affine Block. Defaults to False. + """ + + def __init__( + self, + in_chs: int, + mid_chs: int, + out_chs: int, + use_lab: bool = False, + ): + super().__init__() + self.stem1 = ConvBNAct( + in_chs, + mid_chs, + kernel_size=3, + stride=2, + use_lab=use_lab, + ) + self.stem2a = ConvBNAct( + mid_chs, + mid_chs // 2, + kernel_size=2, + stride=1, + use_lab=use_lab, + ) + self.stem2b = ConvBNAct( + mid_chs // 2, + mid_chs, + kernel_size=2, + stride=1, + use_lab=use_lab, + ) + self.stem3 = ConvBNAct( + mid_chs * 2, + mid_chs, + kernel_size=3, + stride=2, + use_lab=use_lab, + ) + self.stem4 = ConvBNAct( + mid_chs, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + x = self.stem1(x) + x = f.pad(x, (0, 1, 0, 1)) + x2 = self.stem2a(x) + x2 = f.pad(x2, (0, 1, 0, 1)) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = torch.cat([x1, x2], dim=1) + x = self.stem3(x) + return self.stem4(x) + + +class HGBlock(nn.Module): + """HGNetV2 block. + + Args: + in_chs (int): In channels. + mid_chs (int): Mid channels. + out_chs (int): Out channels. + layer_num (int): Number of convolutional layers. + kernel_size (int, optional): kernel size. Defaults to 3. + residual (bool, optional): Add residual. Defaults to False. + light_block (bool, optional): Use LightConvBNAct layer. Defaults to False. + use_lab (bool, optional): User Learnable Affine Block. Defaults to False. 
+ drop_path (float, optional): Dropout rate. Defaults to 0.0. + """ + + def __init__( + self, + in_chs: int, + mid_chs: int, + out_chs: int, + layer_num: int, + kernel_size: int = 3, + residual: bool = False, + light_block: bool = False, + use_lab: bool = False, + drop_path: float = 0.0, + ): + super().__init__() + self.residual = residual + + self.layers = nn.ModuleList() + for i in range(layer_num): + if light_block: + self.layers.append( + LightConvBNAct( + in_chs if i == 0 else mid_chs, + mid_chs, + kernel_size=kernel_size, + use_lab=use_lab, + ), + ) + else: + self.layers.append( + ConvBNAct( + in_chs if i == 0 else mid_chs, + mid_chs, + kernel_size=kernel_size, + stride=1, + use_lab=use_lab, + ), + ) + + # feature aggregation + total_chs = in_chs + layer_num * mid_chs + aggregation_squeeze_conv = ConvBNAct( + total_chs, + out_chs // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + aggregation_excitation_conv = ConvBNAct( + out_chs // 2, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + self.aggregation = nn.Sequential( + aggregation_squeeze_conv, + aggregation_excitation_conv, + ) + + self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + identity = x + output = [x] + for layer in self.layers: + x = layer(x) + output.append(x) + x = torch.cat(output, dim=1) + x = self.aggregation(x) + if self.residual: + return self.drop_path(x) + identity + return x + + +class HGStage(nn.Module): + """HGNetV2 Stage Block. + + Args: + in_chs (int): In channels. + mid_chs (int): Mid channels. + out_chs (int): Out channels. + block_num (int): Number of blocks. + layer_num (int): Number of convolutional layers. + downsample (bool, optional): Downsample. Defaults to True. + light_block (bool, optional): Use LightConvBNAct layer. Defaults to False. + kernel_size (int, optional): kernel size. Defaults to 3. + use_lab (bool, optional): User Learnable Affine Block. Defaults to False. + drop_path (float, optional): Dropout rate. Defaults to 0.0. + """ + + def __init__( + self, + in_chs: int, + mid_chs: int, + out_chs: int, + block_num: int, + layer_num: int, + downsample: bool = True, + light_block: bool = False, + kernel_size: int = 3, + use_lab: bool = False, + drop_path: float = 0.0, + ): + super().__init__() + + self.downsample = ( + ConvBNAct( + in_chs, + in_chs, + kernel_size=3, + stride=2, + groups=in_chs, + use_act=False, + use_lab=use_lab, + ) + if downsample + else nn.Identity() + ) + + blocks_list = [ + HGBlock( + out_chs if i > 0 else in_chs, + mid_chs, + out_chs, + layer_num, + residual=i > 0, + kernel_size=kernel_size, + light_block=light_block, + use_lab=use_lab, + drop_path=drop_path, + ) + for i in range(block_num) + ] + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + x = self.downsample(x) + return self.blocks(x) + + +class HGNetv2Module(nn.Module): + """HGNetV2 Module. + + Args: + name (str): backbone name (i.e. B0, B2, B4, B5). + use_lab (bool, optional): User Learnable Affine Block. Defaults to False. + return_idx (list[int], optional): Feature Maps. Defaults to [1, 2, 3]. + freeze_stem_only (bool, optional): Freeze Stem only. Defaults to True. + freeze_at (int, optional): Freeze at which stage block. Defaults to 0. 
+ freeze_norm (bool, optional): Freeze normalization or not. Defaults to True. + pretrained (bool, optional): Use backbone pretrained weight. Defaults to False. + """ + + arch_configs: ClassVar = { + "B0": { + "stem_channels": [3, 16, 16], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [16, 16, 64, 1, False, False, 3, 3], + "stage2": [64, 32, 256, 1, True, False, 3, 3], + "stage3": [256, 64, 512, 2, True, True, 5, 3], + "stage4": [512, 128, 1024, 1, True, True, 5, 3], + }, + "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B0_stage1.pth", + }, + "B2": { + "stem_channels": [3, 24, 32], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 96, 1, False, False, 3, 4], + "stage2": [96, 64, 384, 1, True, False, 3, 4], + "stage3": [384, 128, 768, 3, True, True, 5, 4], + "stage4": [768, 256, 1536, 1, True, True, 5, 4], + }, + "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B2_stage1.pth", + }, + "B4": { + "stem_channels": [3, 32, 48], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + }, + "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B4_stage1.pth", + }, + "B5": { + "stem_channels": [3, 32, 64], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + }, + "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B5_stage1.pth", + }, + } + + def __init__( + self, + name: str, + use_lab: bool = False, + return_idx: tuple = (1, 2, 3), + freeze_stem_only: bool = True, + freeze_at: int = 0, + freeze_norm: bool = True, + pretrained: bool = False, + ) -> None: + super().__init__() + self.use_lab = use_lab + self.return_idx = return_idx + + stem_channels = self.arch_configs[name]["stem_channels"] + stage_config = self.arch_configs[name]["stage_config"] + download_url = self.arch_configs[name]["url"] + + self._out_strides = [4, 8, 16, 32] + self._out_channels = [stage_config[k][2] for k in stage_config] + + # stem + self.stem = HGNetv2StemBlock( + in_chs=stem_channels[0], + mid_chs=stem_channels[1], + out_chs=stem_channels[2], + use_lab=use_lab, + ) + + # stages + self.stages = nn.ModuleList() + for k in stage_config: + ( + in_channels, + mid_channels, + out_channels, + block_num, + downsample, + light_block, + kernel_size, + layer_num, + ) = stage_config[k] + self.stages.append( + HGStage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample, + light_block, + kernel_size, + use_lab, + ), + ) + + if freeze_at >= 0: + self._freeze_parameters(self.stem) + if not freeze_stem_only: + for i in range(min(freeze_at + 1, len(self.stages))): + self._freeze_parameters(self.stages[i]) + + if freeze_norm: + self._freeze_norm(self) + + if pretrained: + state = torch.hub.load_state_dict_from_url( + download_url, + map_location="cpu", + 
) + print(f"Loaded stage1 {name} HGNetV2 from URL.") + self.load_state_dict(state) + + def _freeze_norm(self, m: nn.Module) -> nn.Module: + """Freeze normalization layers. + + Args: + m (nn.Module): Normalization module. + + Returns: + nn.Module: Freezed normalization module. + """ + if isinstance(m, nn.BatchNorm2d): + m = FrozenBatchNorm2d(m.num_features) + else: + for name, child in m.named_children(): + _child = self._freeze_norm(child) + if _child is not child: + setattr(m, name, _child) + return m + + def _freeze_parameters(self, m: nn.Module) -> None: + """Freeze module parameters. + + Args: + m (nn.Module): Module to freeze. + """ + for p in m.parameters(): + p.requires_grad = False + + def forward(self, x: Tensor) -> list[Tensor]: + """Forward function. + + Args: + x (Tensor): Input tensor. + + Returns: + list[Tensor]: Output tensor. + """ + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + return outs + + +class HGNetv2: + """HGNetV2 backbone.""" + + backbone_cfg: ClassVar[dict[str, Any]] = { + "dfine_hgnetv2_n": { + "name": "B0", + "return_idx": [2, 3], + "freeze_at": -1, + "freeze_norm": False, + "use_lab": True, + "freeze_stem_only": True, + "pretrained": True, + }, + "dfine_hgnetv2_s": { + "name": "B0", + "return_idx": [1, 2, 3], + "freeze_at": -1, + "freeze_norm": False, + "use_lab": True, + }, + "dfine_hgnetv2_m": { + "name": "B2", + "return_idx": [1, 2, 3], + "freeze_at": -1, + "freeze_norm": False, + "use_lab": True, + }, + "dfine_hgnetv2_l": { + "name": "B4", + "return_idx": [1, 2, 3], + "freeze_at": 0, + "freeze_norm": True, + "freeze_stem_only": True, + }, + "dfine_hgnetv2_x": { + "name": "B5", + "return_idx": [1, 2, 3], + "freeze_at": 0, + "freeze_norm": True, + "freeze_stem_only": True, + }, + } + + def __new__(cls, model_name: str) -> HGNetv2Module: + """Create HGNetV2 backbone. + + Args: + model_name (str): Model name. + + Returns: + HGNetv2Module: HGNetV2 backbone. 
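+
+        Example:
+            A minimal sketch; the input resolution is illustrative, and the
+            weights stay randomly initialized because ``pretrained`` is not set
+            for this configuration key::
+
+                import torch
+
+                backbone = HGNetv2("dfine_hgnetv2_x")
+                feats = backbone(torch.rand(1, 3, 640, 640))
+                # three feature maps at strides 8, 16 and 32 (return_idx=[1, 2, 3])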
+ """ + return HGNetv2Module(**cls.backbone_cfg[model_name]) diff --git a/src/otx/algo/detection/d_fine.py b/src/otx/algo/detection/d_fine.py new file mode 100644 index 00000000000..5e16aa9c3c7 --- /dev/null +++ b/src/otx/algo/detection/d_fine.py @@ -0,0 +1,308 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""D-Fine model implementations.""" + +from __future__ import annotations + +import copy +import re +from typing import TYPE_CHECKING, Any, Literal + +import torch +from torch import Tensor, nn +from torchvision.ops import box_convert +from torchvision.tv_tensors import BoundingBoxFormat + +from otx.algo.detection.backbones.hgnetv2 import HGNetv2 +from otx.algo.detection.detectors import DETR +from otx.algo.detection.heads.dfine_decoder import DFINETransformer +from otx.algo.detection.losses.dfine_loss import DFINECriterion +from otx.algo.detection.necks.dfine_hybrid_encoder import HybridEncoder +from otx.core.config.data import TileConfig +from otx.core.data.entity.base import OTXBatchLossEntity +from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity +from otx.core.exporter.base import OTXModelExporter +from otx.core.exporter.native import OTXNativeModelExporter +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.detection import ExplainableOTXDetModel + +if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes + + +PRETRAINED_ROOT: str = "https://github.com/Peterande/storage/releases/download/dfinev1.0/" + +PRETRAINED_WEIGHTS: dict[str, str] = { + "dfine_hgnetv2_n": PRETRAINED_ROOT + "dfine_n_coco.pth", + "dfine_hgnetv2_s": PRETRAINED_ROOT + "dfine_s_coco.pth", + "dfine_hgnetv2_m": PRETRAINED_ROOT + "dfine_m_coco.pth", + "dfine_hgnetv2_l": PRETRAINED_ROOT + "dfine_l_coco.pth", + "dfine_hgnetv2_x": PRETRAINED_ROOT + "dfine_x_coco.pth", +} + + +class DFine(ExplainableOTXDetModel): + """OTX Detection model class for D-Fine.""" + + input_size_multiplier = 32 + mean: tuple[float, float, float] = (0.0, 0.0, 0.0) + std: tuple[float, float, float] = (255.0, 255.0, 255.0) + + def __init__( + self, + model_name: Literal[ + "dfine_hgnetv2_n", + "dfine_hgnetv2_s", + "dfine_hgnetv2_m", + "dfine_hgnetv2_l", + "dfine_hgnetv2_x", + ], + label_info: LabelInfoTypes, + input_size: tuple[int, int] = (640, 640), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, + multi_scale: bool = False, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), + ) -> None: + self.load_from: str = PRETRAINED_WEIGHTS[model_name] + self.multi_scale = multi_scale + super().__init__( + model_name=model_name, + label_info=label_info, + input_size=input_size, + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, + ) + + def _build_model(self, num_classes: int) -> DETR: + backbone = HGNetv2(model_name=self.model_name) + encoder = HybridEncoder(model_name=self.model_name) + decoder = DFINETransformer( + model_name=self.model_name, + num_classes=num_classes, + ) + criterion = DFINECriterion( + 
weight_dict={ + "loss_vfl": 1, + "loss_bbox": 5, + "loss_giou": 2, + "loss_fgl": 0.15, + "loss_ddf": 1.5, + }, + alpha=0.75, + gamma=2.0, + reg_max=32, + num_classes=num_classes, + ) + + if self.model_name == "dfine_hgnetv2_n": + backbone_lr = 0.0004 + elif self.model_name == "dfine_hgnetv2_s": + backbone_lr = 0.0001 + elif self.model_name == "dfine_hgnetv2_m": + backbone_lr = 0.00002 + elif self.model_name in ("dfine_hgnetv2_l", "dfine_hgnetv2_x"): + backbone_lr = 0.0000125 + else: + msg = f"Unsupported model name: {self.model_name}" + raise ValueError(msg) + + optimizer_configuration = [ + # no weight decay for norm layers in backbone + {"params": "^(?=.*backbone)(?=.*norm).*$", "weight_decay": 0.0, "lr": backbone_lr}, + # lr for the backbone, but not norm layers is 0.00001 + {"params": "^(?=.*backbone)(?!.*norm).*$", "lr": backbone_lr}, + # no weight decay for norm layers and biases in encoder and decoder layers + {"params": "^(?=.*(?:encoder|decoder))(?=.*(?:norm|bias)).*$", "weight_decay": 0.0}, + ] + + return DETR( + multi_scale=None if self.multi_scale else [], + backbone=backbone, + encoder=encoder, + decoder=decoder, + criterion=criterion, + num_classes=num_classes, + optimizer_configuration=optimizer_configuration, + ) + + def _customize_inputs( + self, + entity: DetBatchDataEntity, + pad_size_divisor: int = 32, + pad_value: int = 0, + ) -> dict[str, Any]: + targets: list[dict[str, Any]] = [] + # prepare bboxes for the model + for bb, ll in zip(entity.bboxes, entity.labels): + # convert to cxcywh if needed + if len(scaled_bboxes := bb): + converted_bboxes = ( + box_convert(bb, in_fmt="xyxy", out_fmt="cxcywh") if bb.format == BoundingBoxFormat.XYXY else bb + ) + # normalize the bboxes + scaled_bboxes = converted_bboxes / torch.tensor(bb.canvas_size[::-1]).tile(2)[None].to( + converted_bboxes.device, + ) + targets.append({"boxes": scaled_bboxes, "labels": ll}) + + return { + "images": entity.images, + "targets": targets, + } + + def _customize_outputs( + self, + outputs: list[torch.Tensor] | dict, # type: ignore[override] + inputs: DetBatchDataEntity, + ) -> DetBatchPredEntity | OTXBatchLossEntity: + if self.training: + if not isinstance(outputs, dict): + raise TypeError(outputs) + + losses = OTXBatchLossEntity() + for k, v in outputs.items(): + if isinstance(v, list): + losses[k] = sum(v) + elif isinstance(v, Tensor): + losses[k] = v + else: + msg = "Loss output should be list or torch.tensor but got {type(v)}" + raise TypeError(msg) + return losses + + original_sizes = [img_info.ori_shape for img_info in inputs.imgs_info] + scores, bboxes, labels = self.model.postprocess(outputs, original_sizes) + + return DetBatchPredEntity( + batch_size=len(outputs), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=scores, + bboxes=bboxes, + labels=labels, + ) + + def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]: + """Configure an optimizer and learning-rate schedulers. + + Set up the optimizer and schedulers from the provided inputs. + Typically, a warmup scheduler is used initially, followed by the main scheduler. + + Returns: + Two list. The former is a list that contains an optimizer + The latter is a list of lr scheduler configs which has a dictionary format. 
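+
+        Example:
+            Rough shape of the return value for an already constructed model;
+            the concrete optimizer and schedulers depend on the configured
+            callables::
+
+                optimizers, lr_scheduler_configs = model.configure_optimizers()
+                # optimizers           -> [torch.optim.Optimizer]
+                # lr_scheduler_configs -> [{"scheduler": <LRScheduler>, ...}]
+                # each config may also carry "interval" and "monitor" keys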
+ """ + param_groups = self._get_optim_params(self.model.optimizer_configuration, self.model) + optimizer = self.optimizer_callable(param_groups) + schedulers = self.scheduler_callable(optimizer) + + def ensure_list(item: Any) -> list: # noqa: ANN401 + return item if isinstance(item, list) else [item] + + lr_scheduler_configs = [] + for scheduler in ensure_list(schedulers): + lr_scheduler_config = {"scheduler": scheduler} + if hasattr(scheduler, "interval"): + lr_scheduler_config["interval"] = scheduler.interval + if hasattr(scheduler, "monitor"): + lr_scheduler_config["monitor"] = scheduler.monitor + lr_scheduler_configs.append(lr_scheduler_config) + + return [optimizer], lr_scheduler_configs + + @staticmethod + def _get_optim_params(cfg: list[dict[str, Any]] | None, model: nn.Module) -> list[dict[str, Any]]: + """Perform no bias decay and learning rate correction for the modules. + + The configuration dict should consist of regular expression pattern for the model parameters with "params" key. + Other optimizer parameters can be added as well. + + E.g.: + cfg = [{"params": "^((?!b).)*$", "lr": 0.01, "weight_decay": 0.0}, ..] + The above configuration is for the parameters that do not contain "b". + + ^(?=.*a)(?=.*b).*$ means including a and b + ^((?!b.)*a((?!b).)*$ means including a but not b + ^((?!b|c).)*a((?!b|c).)*$ means including a but not (b | c) + """ + if cfg is None: + return model.parameters() + + cfg = copy.deepcopy(cfg) + + param_groups = [] + visited = [] + for pg in cfg: + if "params" not in pg: + msg = f"The 'params' key should be included in the configuration, but got {pg.keys()}" + raise ValueError(msg) + pattern = pg["params"] + params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0} + pg["params"] = params.values() + param_groups.append(pg) + visited.extend(list(params.keys())) + + names = [k for k, v in model.named_parameters() if v.requires_grad] + + if len(visited) < len(names): + unseen = set(names) - set(visited) + params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen} + param_groups.append({"params": params.values()}) + visited.extend(list(params.keys())) + + return param_groups + + @property + def _exporter(self) -> OTXModelExporter: + """Creates OTXModelExporter object that can export the model.""" + if self.input_size is None: + msg = f"Input size attribute is not set for {self.__class__}" + raise ValueError(msg) + + return OTXNativeModelExporter( + task_level_export_parameters=self._export_parameters, + input_size=(1, 3, *self.input_size), + mean=self.mean, + std=self.std, + resize_mode="standard", + swap_rgb=False, + via_onnx=False, + onnx_export_configuration={ + "input_names": ["images"], + "output_names": ["bboxes", "labels", "scores"], + "dynamic_axes": { + "images": {0: "batch"}, + "boxes": {0: "batch", 1: "num_dets"}, + "labels": {0: "batch", 1: "num_dets"}, + "scores": {0: "batch", 1: "num_dets"}, + }, + "autograd_inlining": False, + "opset_version": 16, + }, + output_names=["bboxes", "labels", "scores"], + ) + + @property + def _optimization_config(self) -> dict[str, Any]: + """PTQ config for D-FINE.""" + return { + "model_type": "transformer", + "advanced_parameters": { + "activations_range_estimator_params": { + "min": {"statistics_type": "QUANTILE", "aggregator_type": "MIN", "quantile_outlier_prob": 1e-4}, + "max": {"statistics_type": "QUANTILE", "aggregator_type": "MAX", "quantile_outlier_prob": 1e-4}, + }, + }, + } diff --git 
a/src/otx/algo/detection/heads/__init__.py b/src/otx/algo/detection/heads/__init__.py index fd20bfe4808..c38d62b64f8 100644 --- a/src/otx/algo/detection/heads/__init__.py +++ b/src/otx/algo/detection/heads/__init__.py @@ -1,12 +1,21 @@ -# Copyright (C) 2023-2024 Intel Corporation +# Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 """Custom head implementations for detection task.""" from .atss_head import ATSSHead +from .dfine_decoder import DFINETransformer from .rtdetr_decoder import RTDETRTransformer from .rtmdet_head import RTMDetSepBNHead from .ssd_head import SSDHead from .yolo_head import YOLOHead from .yolox_head import YOLOXHead -__all__ = ["ATSSHead", "RTDETRTransformer", "RTMDetSepBNHead", "SSDHead", "YOLOHead", "YOLOXHead"] +__all__ = [ + "ATSSHead", + "DFINETransformer", + "RTDETRTransformer", + "RTMDetSepBNHead", + "SSDHead", + "YOLOHead", + "YOLOXHead", +] diff --git a/src/otx/algo/detection/heads/dfine_decoder.py b/src/otx/algo/detection/heads/dfine_decoder.py new file mode 100644 index 00000000000..d28e0cf3864 --- /dev/null +++ b/src/otx/algo/detection/heads/dfine_decoder.py @@ -0,0 +1,935 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""D-FINE Decoder. Modified from D-FINE (https://github.com/Peterande/D-FINE).""" + +from __future__ import annotations + +import copy +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, ClassVar + +import torch +import torch.nn.functional as f +from torch import Tensor, nn +from torch.nn import init + +from otx.algo.common.layers.transformer_layers import MLP, MSDeformableAttentionV2 +from otx.algo.common.utils.utils import inverse_sigmoid +from otx.algo.detection.heads.rtdetr_decoder import get_contrastive_denoising_training_group +from otx.algo.detection.utils.utils import dfine_distance2bbox, dfine_weighting_function +from otx.algo.utils.weight_init import bias_init_with_prob + + +class TransformerDecoderLayer(nn.Module): + """Transformer Decoder Layer with MSDeformableAttentionV2. + + Args: + d_model (int): The number of expected features in the input. Defaults to 256. + n_head (int): The number of heads in the multiheadattention models. Defaults to 8. + dim_feedforward (int): The dimension of the feedforward network model. Defaults to 1024. + dropout (float): The dropout value. Defaults to 0.0. + activation (Callable[..., nn.Module] | None, optional): The activation function. Defaults to None. + n_levels (int): The number of levels in MSDeformableAttention. Defaults to 4. + num_points_list (list[int], optional): Number of distinct points for each layer. Defaults to [3, 6, 3]. 
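+
+    Example:
+        A shape-only sketch; sizes are illustrative, and ``value`` must already
+        be split per level and head (see ``TransformerDecoder.value_op``)::
+
+            import torch
+
+            layer = TransformerDecoderLayer(d_model=256, n_head=8, n_levels=3)
+            spatial_shapes = [[32, 32], [16, 16], [8, 8]]
+            value = [torch.rand(2, 8, 32, h * w) for h, w in spatial_shapes]
+            target = torch.rand(2, 100, 256)
+            reference_points = torch.rand(2, 100, 1, 4)
+            out = layer(target, reference_points, value, spatial_shapes)  # [2, 100, 256]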
+ """ + + def __init__( + self, + d_model: int = 256, + n_head: int = 8, + dim_feedforward: int = 1024, + dropout: float = 0.0, + activation: Callable[..., nn.Module] = partial(nn.ReLU, inplace=True), + n_levels: int = 4, + num_points_list: list[int] = [3, 6, 3], # noqa: B006 + ): + super().__init__() + + # self attention + self.self_attn = nn.MultiheadAttention( + d_model, + n_head, + dropout=dropout, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # cross attention + self.cross_attn = MSDeformableAttentionV2( + d_model, + n_head, + n_levels, + num_points_list, + ) + self.dropout2 = nn.Dropout(dropout) + + # gate + self.gateway = Gate(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = activation() + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + """Reset parameters of the model.""" + init.xavier_uniform_(self.linear1.weight) + init.xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor: Tensor, pos: Tensor | None) -> Tensor: + """Add positional embedding to the input tensor.""" + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt: Tensor) -> Tensor: + """Forward function of feed forward network.""" + return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + + def forward( + self, + target: Tensor, + reference_points: Tensor, + value: Tensor, + spatial_shapes: list[list[int]], + attn_mask: Tensor | None = None, + query_pos_embed: Tensor | None = None, + ) -> Tensor: + """Forward function of the Transformer Decoder Layer. + + Args: + target (Tensor): target feature tensor. + reference_points (Tensor): reference points tensor. + value (Tensor): value tensor. + spatial_shapes (list[list[int]]): spatial shapes of the value tensor. + attn_mask (Tensor | None, optional): attention mask. Defaults to None. + query_pos_embed (Tensor | None, optional): query positional embedding. Defaults to None. + + Returns: + Tensor: updated target tensor. + """ + # self attention + q = k = self.with_pos_embed(target, query_pos_embed) + + target2, _ = self.self_attn(q, k, value=target, attn_mask=attn_mask) + target = target + self.dropout1(target2) + target = self.norm1(target) + + # cross attention + target2 = self.cross_attn( + self.with_pos_embed(target, query_pos_embed), + reference_points, + value, + spatial_shapes, + ) + + target = self.gateway(target, self.dropout2(target2)) + + # ffn + target2 = self.forward_ffn(target) + target = target + self.dropout4(target2) + return self.norm3(target.clamp(min=-65504, max=65504)) + + +class Gate(nn.Module): + """Target Gating Layers. + + Args: + d_model (int): The number of expected features in the input. + """ + + def __init__(self, d_model: int) -> None: + super().__init__() + self.gate = nn.Linear(2 * d_model, 2 * d_model) + bias = bias_init_with_prob(0.5) + init.constant_(self.gate.bias, bias) + init.constant_(self.gate.weight, 0) + self.norm = nn.LayerNorm(d_model) + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + """Forward function of the gate. + + Args: + x1 (Tensor): first target input tensor. + x2 (Tensor): second target input tensor. + + Returns: + Tensor: gated target tensor. 
+ """ + gate_input = torch.cat([x1, x2], dim=-1) + gates = torch.sigmoid(self.gate(gate_input)) + gate1, gate2 = gates.chunk(2, dim=-1) + return self.norm(gate1 * x1 + gate2 * x2) + + +class Integral(nn.Module): + """A static layer that calculates integral results from a distribution. + + This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`, + where Pr(n) is the softmax probability vector representing the discrete + distribution, and W(n) is the non-uniform Weighting Function. + + Args: + reg_max (int): Max number of the discrete bins. Default is 32. + It can be adjusted based on the dataset or task requirements. + """ + + def __init__(self, reg_max: int = 32): + super().__init__() + self.reg_max = reg_max + + def forward(self, x: Tensor, box_distance_weight: Tensor) -> Tensor: + """Forward function of the Integral layer.""" + shape = x.shape + x = f.softmax(x.reshape(-1, self.reg_max + 1), dim=1) + x = f.linear(x, box_distance_weight).reshape(-1, 4) + return x.reshape([*list(shape[:-1]), -1]) + + +class LQE(nn.Module): + """Localization Quality Estimation. + + Args: + k (int): number of edge points. + hidden_dim (int): The number of expected features in the input. + num_layers (int): The number of layers in the MLP. + reg_max (int): Max number of the discrete bins. + """ + + def __init__( + self, + k: int, + hidden_dim: int, + num_layers: int, + reg_max: int, + ): + super().__init__() + self.k = k + self.reg_max = reg_max + self.reg_conf = MLP( + input_dim=4 * (k + 1), + hidden_dim=hidden_dim, + output_dim=1, + num_layers=num_layers, + activation=partial(nn.ReLU, inplace=True), + ) + init.constant_(self.reg_conf.layers[-1].bias, 0) + init.constant_(self.reg_conf.layers[-1].weight, 0) + + def forward(self, scores: Tensor, pred_corners: Tensor) -> Tensor: + """Forward function of the LQE layer. + + Args: + scores (Tensor): Prediction scores. + pred_corners (Tensor): Predicted bounding box corners. + + Returns: + Tensor: Updated scores. + """ + b, num_pred, _ = pred_corners.size() + prob = f.softmax(pred_corners.reshape(b, num_pred, 4, self.reg_max + 1), dim=-1) + prob_topk, _ = prob.topk(self.k, dim=-1) + stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1) + quality_score = self.reg_conf(stat.reshape(b, num_pred, -1)) + return scores + quality_score + + +class TransformerDecoder(nn.Module): + """Transformer Decoder implementing Fine-grained Distribution Refinement (FDR). + + This decoder refines object detection predictions through iterative updates across multiple layers, + utilizing attention mechanisms, location quality estimators, and distribution refinement techniques + to improve bounding box accuracy and robustness. + + Args: + hidden_dim (int): The number of expected features in the input. + decoder_layer (nn.Module): The decoder layer module. + decoder_layer_wide (nn.Module): The wide decoder layer module. + num_layers (int): The number of layers. + num_head (int): The number of heads in the multi-head attention models. + reg_max (int): The number of discrete bins for bounding box regression. + reg_scale (Tensor): The curvature of the Weighting Function. + up (Tensor): The upper bound of the sequence. + eval_idx (int, optional): evaluation index. Defaults to -1. 
+ """ + + def __init__( + self, + hidden_dim: int, + decoder_layer: nn.Module, + decoder_layer_wide: nn.Module, + num_layers: int, + num_head: int, + reg_max: int, + reg_scale: Tensor, + up: Tensor, + eval_idx: int = -1, + ) -> None: + super().__init__() + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.num_head = num_head + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + self.up, self.reg_scale, self.reg_max = up, reg_scale, reg_max + self.layers = nn.ModuleList( + [copy.deepcopy(decoder_layer) for _ in range(self.eval_idx + 1)] + + [copy.deepcopy(decoder_layer_wide) for _ in range(num_layers - self.eval_idx - 1)], + ) + self.lqe_layers = nn.ModuleList([copy.deepcopy(LQE(4, 64, 2, reg_max)) for _ in range(num_layers)]) + self.box_distance_weight = nn.Parameter( + dfine_weighting_function(self.reg_max, self.up, self.reg_scale), + requires_grad=False, + ) + + def value_op( + self, + memory: Tensor, + memory_spatial_shapes: list[list[int]], + ) -> tuple[Tensor, ...]: + """Preprocess values for MSDeformableAttention.""" + memory = memory.reshape(memory.shape[0], memory.shape[1], self.num_head, -1) + split_shape = [h * w for h, w in memory_spatial_shapes] + return memory.permute(0, 2, 3, 1).split(split_shape, dim=-1) + + def forward( + self, + target: Tensor, + ref_points_unact: Tensor, + memory: Tensor, + spatial_shapes: list[list[int]], + bbox_head: nn.Module, + score_head: nn.Module, + query_pos_head: nn.Module, + pre_bbox_head: nn.Module, + integral: nn.Module, + reg_scale: Tensor, + attn_mask: Tensor | None = None, + ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + """Forward function of the Transformer Decoder. + + Args: + target (Tensor): target feature tensor. + ref_points_unact (Tensor): reference points tensor. + memory (Tensor): memory tensor. + spatial_shapes (list[list[int]]): spatial shapes of the memory tensor. + bbox_head (nn.Module): bounding box head. + score_head (nn.Module): label score head. + query_pos_head (nn.Module): query position head. + pre_bbox_head (nn.Module): pre-bounding box head. + integral (nn.Module): integral module. + reg_scale (Tensor): number of discrete bins for bounding box regression. + attn_mask (Tensor | None, optional): attention mask tensor. Defaults to None. 
+ + Returns: + tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + out_bboxes (Tensor): bounding box predictions from all layers + out_logits (Tensor): label score predictions from all layers + out_corners (Tensor): bounding box corner predictions from all layers + out_refs (Tensor): reference points from all layers + pre_bboxes (Tensor): initial bounding box predictions + pre_scores (Tensor): initial label score predictions + """ + output = target + output_detach = pred_corners_undetach = 0 + value = self.value_op(memory, spatial_shapes) + + out_bboxes = [] + out_logits = [] + out_corners = [] + out_refs = [] + box_distance_weight = self.box_distance_weight + + ref_points_detach = f.sigmoid(ref_points_unact) + + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = query_pos_head(ref_points_detach).clamp(min=-10, max=10) + output = layer(output, ref_points_input, value, spatial_shapes, attn_mask, query_pos_embed) + + if i == 0: + # Initial bounding box predictions with inverse sigmoid refinement + pre_bboxes = f.sigmoid(pre_bbox_head(output) + inverse_sigmoid(ref_points_detach)) + pre_scores = score_head[0](output) + initial_ref_boxes = pre_bboxes.detach() + + # Refine bounding box corners using FDR, integrating previous layer's corrections + pred_corners = bbox_head[i](output + output_detach) + pred_corners_undetach + inter_ref_bbox = dfine_distance2bbox( + initial_ref_boxes, + integral(pred_corners, box_distance_weight), + reg_scale, + ) + + if self.training or i == self.eval_idx: + scores = score_head[i](output) + # Lqe does not affect the performance here. + scores = self.lqe_layers[i](scores, pred_corners) + out_logits.append(scores) + out_bboxes.append(inter_ref_bbox) + out_corners.append(pred_corners) + out_refs.append(initial_ref_boxes) + + if not self.training: + break + + pred_corners_undetach = pred_corners + ref_points_detach = inter_ref_bbox.detach() + output_detach = output.detach() + + return ( + torch.stack(out_bboxes), # out_bboxes + torch.stack(out_logits), # out_logits + torch.stack(out_corners), # out_corners + torch.stack(out_refs), # out_refs + pre_bboxes, + pre_scores, + ) + + +class DFINETransformerModule(nn.Module): + """D-FINE Transformer Module. + + Args: + num_classes (int, optional): num of classes. Defaults to 80. + hidden_dim (int, optional): Hidden dimension size.. Defaults to 256. + num_queries (int, optional): Number of queries. Defaults to 300. + feat_channels (list[int], optional): List of feature channels. Defaults to [256, 256, 256]. + num_points_list (list[int], optional): Number of points for each level. Defaults to [3, 6, 3]. + num_decoder_layers (int, optional): Number of decoder layers. Defaults to 6. + dim_feedforward (int, optional): Dimension of the feedforward network. Defaults to 1024. + dropout (float, optional): dropout rate. Defaults to 0.0. + activation (Callable[..., nn.Module], optional): activation layer. Defaults to nn.ReLU. + num_denoising (int, optional): Number of denoising samples. Defaults to 100. + label_noise_ratio (float, optional): Ratio of label noise. Defaults to 0.5. + box_noise_scale (float, optional): Scale of box noise. Defaults to 1.0. + eval_spatial_size (list[int], optional): Spatial size for evaluation. Defaults to [640, 640]. + eval_idx (int, optional): Evaluation index. Defaults to -1. + reg_scale (float, optional): The weight curvature. Defaults to 4.0. + reg_max (int, optional): The number of bins for box regression. Defaults to 32. 
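Putting the pieces together, a hypothetical inference-time call of the module described above; shapes assume the default 640x640 eval size with strides 8/16/32, and this is a sketch rather than a tested snippet:

```python
import torch
from otx.algo.detection.heads.dfine_decoder import DFINETransformerModule

head = DFINETransformerModule(num_classes=20).eval()
# Three feature maps matching feat_channels=[256, 256, 256] at strides 8/16/32.
feats = [
    torch.randn(1, 256, 80, 80),
    torch.randn(1, 256, 40, 40),
    torch.randn(1, 256, 20, 20),
]
with torch.no_grad():
    out = head(feats)
print(out["pred_logits"].shape, out["pred_boxes"].shape)
# expected: (1, 300, 20) and (1, 300, 4), boxes in normalized cxcywh
```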
+ """ + + def __init__( + self, + num_classes: int = 80, + hidden_dim: int = 256, + num_queries: int = 300, + feat_channels: list[int] = [256, 256, 256], # noqa: B006 + feat_strides: list[int] = [8, 16, 32], # noqa: B006 + num_levels: int = 3, + num_points_list: list[int] = [3, 6, 3], # noqa: B006 + nhead: int = 8, + num_decoder_layers: int = 6, + dim_feedforward: int = 1024, + dropout: float = 0.0, + activation: Callable[..., nn.Module] = nn.ReLU, + num_denoising: int = 100, + label_noise_ratio: float = 0.5, + box_noise_scale: float = 1.0, + eval_spatial_size: list[int] = [640, 640], # noqa: B006 + eval_idx: int = -1, + reg_scale: float = 4.0, + reg_max: int = 32, + ): + super().__init__() + for _ in range(num_levels - len(feat_strides)): + feat_strides.append(feat_strides[-1] * 2) + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.feat_strides = feat_strides + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = 1e-2 + self.num_decoder_layers = num_decoder_layers + self.eval_spatial_size = eval_spatial_size + self.reg_max = reg_max + + # backbone feature projection + self._build_input_proj_layer(feat_channels) + + # Transformer module + self.up = nn.Parameter(torch.tensor([0.5]), requires_grad=False) + self.reg_scale = nn.Parameter(torch.tensor([reg_scale]), requires_grad=False) + decoder_layer = TransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_levels, + num_points_list, + ) + decoder_layer_wide = TransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_levels, + num_points_list, + ) + self.decoder = TransformerDecoder( + hidden_dim, + decoder_layer, + decoder_layer_wide, + num_decoder_layers, + nhead, + reg_max, + self.reg_scale, + self.up, + eval_idx, + ) + # denoising + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + if num_denoising > 0: + self.denoising_class_embed = nn.Embedding(num_classes + 1, hidden_dim, padding_idx=num_classes) + init.normal_(self.denoising_class_embed.weight[:-1]) + + # decoder embedding + self.query_pos_head = MLP( + input_dim=4, + hidden_dim=2 * hidden_dim, + output_dim=hidden_dim, + num_layers=2, + activation=partial(nn.ReLU, inplace=True), + ) + + # encoder head + self.enc_output = nn.Sequential( + OrderedDict( + [ + ("proj", nn.Linear(hidden_dim, hidden_dim)), + ("norm", nn.LayerNorm(hidden_dim)), + ], + ), + ) + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + self.enc_bbox_head = MLP( + input_dim=hidden_dim, + hidden_dim=hidden_dim, + output_dim=4, + num_layers=3, + activation=partial(nn.ReLU, inplace=True), + ) + + # decoder head + self.eval_idx = eval_idx if eval_idx >= 0 else num_decoder_layers + eval_idx + self.dec_score_head = nn.ModuleList([nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers)]) + # distribution refinement over num of self.reg_max bins + self.dec_bbox_head = nn.ModuleList( + [ + MLP( + input_dim=hidden_dim, + hidden_dim=hidden_dim, + output_dim=4 * (self.reg_max + 1), + num_layers=3, + activation=partial(nn.ReLU, inplace=True), + ) + for _ in range(num_decoder_layers) + ], + ) + self.pre_bbox_head = MLP( + input_dim=hidden_dim, + hidden_dim=hidden_dim, + output_dim=4, + num_layers=3, + activation=partial(nn.ReLU, inplace=True), + ) + + self.integral = Integral(self.reg_max) + + # init encoder output anchors and valid_mask + if self.eval_spatial_size: + self.anchors, 
self.valid_mask = self._generate_anchors() + + self._reset_parameters(feat_channels) + + def _reset_parameters(self, feat_channels: list[int]) -> None: + """Reset parameters of the module.""" + bias = bias_init_with_prob(0.01) + init.constant_(self.enc_score_head.bias, bias) + init.constant_(self.enc_bbox_head.layers[-1].weight, 0) + init.constant_(self.enc_bbox_head.layers[-1].bias, 0) + + init.constant_(self.pre_bbox_head.layers[-1].weight, 0) + init.constant_(self.pre_bbox_head.layers[-1].bias, 0) + + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + init.constant_(cls_.bias, bias) + if hasattr(reg_, "layers"): + init.constant_(reg_.layers[-1].weight, 0) + init.constant_(reg_.layers[-1].bias, 0) + + init.xavier_uniform_(self.enc_output[0].weight) + init.xavier_uniform_(self.query_pos_head.layers[0].weight) + init.xavier_uniform_(self.query_pos_head.layers[1].weight) + for m, in_channels in zip(self.input_proj, feat_channels): + if in_channels != self.hidden_dim: + init.xavier_uniform_(m[0].weight) + + def _build_input_proj_layer(self, feat_channels: list[int]) -> None: + """Build input projection layer.""" + self.input_proj = nn.ModuleList() + for in_channels in feat_channels: + if in_channels == self.hidden_dim: + self.input_proj.append(nn.Identity()) + else: + self.input_proj.append( + nn.Sequential( + OrderedDict( + [ + ("conv", nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)), + ("norm", nn.BatchNorm2d(self.hidden_dim)), + ], + ), + ), + ) + + in_channels = feat_channels[-1] + + for _ in range(self.num_levels - len(feat_channels)): + if in_channels == self.hidden_dim: + self.input_proj.append(nn.Identity()) + else: + self.input_proj.append( + nn.Sequential( + OrderedDict( + [ + ("conv", nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)), + ("norm", nn.BatchNorm2d(self.hidden_dim)), + ], + ), + ), + ) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats: list[Tensor]) -> tuple[Tensor, list[list[int]]]: + """Flatten feature maps and get spatial shapes for encoder input. + + Args: + feats (list[Tensor]): List of feature maps. + + Returns: + tuple[Tensor, list[list[int]]]: + Tensor: Flattened feature maps. + list[list[int]]: List of spatial shapes for each feature map. 
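`_get_encoder_input` above simply concatenates the per-level maps along the token axis and records their spatial shapes. The same reshape in isolation (pure torch, illustrative shapes):

```python
import torch

feats = [
    torch.randn(2, 256, 80, 80),
    torch.randn(2, 256, 40, 40),
    torch.randn(2, 256, 20, 20),
]
flattened, spatial_shapes = [], []
for feat in feats:
    _, _, h, w = feat.shape
    flattened.append(feat.flatten(2).permute(0, 2, 1))  # [B, C, H, W] -> [B, H*W, C]
    spatial_shapes.append([h, w])
memory = torch.concat(flattened, dim=1)
print(memory.shape)    # torch.Size([2, 8400, 256])  (6400 + 1600 + 400 tokens)
print(spatial_shapes)  # [[80, 80], [40, 40], [20, 20]]
```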
+ """ + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + for feat in proj_feats: + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).permute(0, 2, 1)) + # [num_levels, 2] + spatial_shapes.append([h, w]) + + # [b, l, c] + feat_flatten = torch.concat(feat_flatten, 1) + return feat_flatten, spatial_shapes + + def _generate_anchors( + self, + spatial_shapes: list[list[int]] | None = None, + grid_size: float = 0.05, + dtype: torch.dtype = torch.float32, + device: str = "cpu", + ) -> tuple[Tensor, Tensor]: + if spatial_shapes is None: + spatial_shapes = [] + eval_h, eval_w = self.eval_spatial_size + for s in self.feat_strides: + spatial_shapes.append([int(eval_h / s), int(eval_w / s)]) + + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij") + grid_xy = torch.stack([grid_x, grid_y], dim=-1) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype) + wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl) + lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4) + anchors.append(lvl_anchors) + + tensor_anchors = torch.concat(anchors, dim=1).to(device) + valid_mask = ((tensor_anchors > self.eps) * (tensor_anchors < 1 - self.eps)).all(-1, keepdim=True) + tensor_anchors = torch.log(tensor_anchors / (1 - tensor_anchors)) + tensor_anchors = torch.where(valid_mask, tensor_anchors, torch.inf) + + return tensor_anchors, valid_mask + + def _get_decoder_input( + self, + memory: Tensor, + spatial_shapes: list[list[int]], + denoising_logits: Tensor | None = None, + denoising_bbox_unact: Tensor | None = None, + ) -> tuple[torch.Tensor, ...]: + # prepare input for decoder + if self.training or self.eval_spatial_size is None: + anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device) + else: + anchors, valid_mask = self.anchors.to(memory.device), self.valid_mask.to(memory.device) + + if memory.shape[0] > 1: + anchors = anchors.repeat(memory.shape[0], 1, 1) + + memory = valid_mask.to(memory.dtype) * memory + + output_memory = self.enc_output(memory) + enc_outputs_logits = self.enc_score_head(output_memory) + + enc_topk_bboxes_list, enc_topk_logits_list = [], [] + enc_topk_memory, enc_topk_logits, enc_topk_anchors = self._select_topk( + output_memory, + enc_outputs_logits, + anchors, + self.num_queries, + ) + + enc_topk_bbox_unact = self.enc_bbox_head(enc_topk_memory) + enc_topk_anchors + + if self.training: + enc_topk_bboxes = f.sigmoid(enc_topk_bbox_unact) + enc_topk_bboxes_list.append(enc_topk_bboxes) + enc_topk_logits_list.append(enc_topk_logits) + + content = enc_topk_memory.detach() + content = enc_topk_memory.detach() + + content = enc_topk_memory.detach() + + enc_topk_bbox_unact = enc_topk_bbox_unact.detach() + + if denoising_bbox_unact is not None: + enc_topk_bbox_unact = torch.concat([denoising_bbox_unact, enc_topk_bbox_unact], dim=1) + content = torch.concat([denoising_logits, content], dim=1) + + return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list + + def _select_topk( + self, + memory: Tensor, + outputs_logits: Tensor, 
+ outputs_anchors_unact: Tensor, + topk: int, + ) -> tuple[Tensor, Tensor, Tensor]: + """Select top-k memory, logits, and anchors. + + Args: + memory (Tensor): memory tensor. + outputs_logits (Tensor): logits tensor. + outputs_anchors_unact (Tensor): unactivated anchors tensor. + topk (int): number of top-k to select. + + Returns: + tuple[Tensor, Tensor, Tensor]: + Tensor: top-k memory tensor. + Tensor: top-k logits tensor. + Tensor: top-k anchors tensor. + """ + _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1) + topk_anchors = outputs_anchors_unact.gather( + dim=1, + index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_anchors_unact.shape[-1]), + ) + + topk_logits = ( + outputs_logits.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1])) + if self.training + else None + ) + + topk_memory = memory.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, memory.shape[-1])) + + return topk_memory, topk_logits, topk_anchors + + def forward(self, feats: Tensor, targets: list[dict[str, Tensor]] | None = None) -> dict[str, Tensor]: + """Forward pass of the DFine Transformer module.""" + # input projection and embedding + memory, spatial_shapes = self._get_encoder_input(feats) + + # prepare denoising training + if self.training and self.num_denoising > 0 and targets is not None: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = get_contrastive_denoising_training_group( + targets, + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=1.0, + ) + else: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + init_ref_contents, init_ref_points_unact, enc_topk_bboxes_list, enc_topk_logits_list = self._get_decoder_input( + memory, + spatial_shapes, + denoising_logits, + denoising_bbox_unact, + ) + + # decoder + out_bboxes, out_logits, out_corners, out_refs, pre_bboxes, pre_logits = self.decoder( + target=init_ref_contents, + ref_points_unact=init_ref_points_unact, + memory=memory, + spatial_shapes=spatial_shapes, + bbox_head=self.dec_bbox_head, + score_head=self.dec_score_head, + query_pos_head=self.query_pos_head, + pre_bbox_head=self.pre_bbox_head, + integral=self.integral, + reg_scale=self.reg_scale, + attn_mask=attn_mask, + ) + + if self.training and dn_meta is not None: + dn_pre_logits, pre_logits = torch.split(pre_logits, dn_meta["dn_num_split"], dim=1) + dn_pre_bboxes, pre_bboxes = torch.split(pre_bboxes, dn_meta["dn_num_split"], dim=1) + dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta["dn_num_split"], dim=2) + dn_out_logits, out_logits = torch.split(out_logits, dn_meta["dn_num_split"], dim=2) + + dn_out_corners, out_corners = torch.split(out_corners, dn_meta["dn_num_split"], dim=2) + dn_out_refs, out_refs = torch.split(out_refs, dn_meta["dn_num_split"], dim=2) + + if self.training: + out = { + "pred_logits": out_logits[-1], + "pred_boxes": out_bboxes[-1], + "pred_corners": out_corners[-1], + "ref_points": out_refs[-1], + "up": self.up, + "reg_scale": self.reg_scale, + } + out["aux_outputs"] = self._set_aux_loss2( + outputs_class=out_logits[:-1], + outputs_coord=out_bboxes[:-1], + outputs_corners=out_corners[:-1], + outputs_ref=out_refs[:-1], + teacher_corners=out_corners[-1], + teacher_logits=out_logits[-1], + ) + out["enc_aux_outputs"] = self._set_aux_loss( + enc_topk_logits_list, + enc_topk_bboxes_list, + ) + out["pre_outputs"] = { + "pred_logits": pre_logits, + "pred_boxes": 
pre_bboxes, + } + + if dn_meta is not None: + out["dn_outputs"] = self._set_aux_loss2( + outputs_class=dn_out_logits, + outputs_coord=dn_out_bboxes, + outputs_corners=dn_out_corners, + outputs_ref=dn_out_refs, + teacher_corners=dn_out_corners[-1], + teacher_logits=dn_out_logits[-1], + ) + out["dn_pre_outputs"] = { + "pred_logits": dn_pre_logits, + "pred_boxes": dn_pre_bboxes, + } + out["dn_meta"] = dn_meta + else: + out = { + "pred_logits": out_logits[-1], + "pred_boxes": out_bboxes[-1], + } + + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class: Tensor, outputs_coord: Tensor) -> list[dict[str, Tensor]]: + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{"pred_logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)] + + @torch.jit.unused + def _set_aux_loss2( + self, + outputs_class: Tensor, + outputs_coord: Tensor, + outputs_corners: Tensor, + outputs_ref: Tensor, + teacher_corners: Tensor, + teacher_logits: Tensor, + ) -> list[dict[str, Tensor]]: + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [ + { + "pred_logits": a, + "pred_boxes": b, + "pred_corners": c, + "ref_points": d, + "teacher_corners": teacher_corners, + "teacher_logits": teacher_logits, + } + for a, b, c, d in zip(outputs_class, outputs_coord, outputs_corners, outputs_ref) + ] + + +class DFINETransformer: + """DFINETransformer factory for detection.""" + + decoder_cfg: ClassVar[dict[str, Any]] = { + "dfine_hgnetv2_n": { + "feat_channels": [128, 128], + "feat_strides": [16, 32], + "hidden_dim": 128, + "dim_feedforward": 512, + "num_levels": 2, + "num_decoder_layers": 3, + "eval_idx": -1, + "num_points_list": [6, 6], + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_s": { + "feat_channels": [256, 256, 256], + "num_decoder_layers": 3, + "eval_idx": -1, + "eval_spatial_size": [640, 640], + "num_points_list": [3, 6, 3], + }, + "dfine_hgnetv2_m": { + "num_decoder_layers": 4, + "eval_idx": -1, + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_l": {}, + "dfine_hgnetv2_x": { + "feat_channels": [384, 384, 384], + "reg_scale": 8.0, + "eval_idx": -1, + "eval_spatial_size": [640, 640], + }, + } + + def __new__(cls, model_name: str, num_classes: int) -> DFINETransformerModule: + """Constructor for DFINETransformerModule.""" + cfg = cls.decoder_cfg[model_name] + return DFINETransformerModule(num_classes=num_classes, **cfg) diff --git a/src/otx/algo/detection/layers/csp_layer.py b/src/otx/algo/detection/layers/csp_layer.py index 6a1c32f5693..d8f76a930a3 100644 --- a/src/otx/algo/detection/layers/csp_layer.py +++ b/src/otx/algo/detection/layers/csp_layer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. 
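The `DFINETransformer` factory defined above only unpacks the per-variant `decoder_cfg` into `DFINETransformerModule`, so for the "n" variant the two constructions in this sketch are equivalent (values copied from the config table above; the snippet itself is illustrative):

```python
from otx.algo.detection.heads.dfine_decoder import DFINETransformer, DFINETransformerModule

head = DFINETransformer(model_name="dfine_hgnetv2_n", num_classes=3)
same = DFINETransformerModule(
    num_classes=3,
    feat_channels=[128, 128],
    feat_strides=[16, 32],
    hidden_dim=128,
    dim_feedforward=512,
    num_levels=2,
    num_decoder_layers=3,
    eval_idx=-1,
    num_points_list=[6, 6],
    eval_spatial_size=[640, 640],
)
```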
"""Implementation of CSPLayer copied from mmdet.models.layers.csp_layer.py.""" @@ -193,7 +193,11 @@ def __init__( normalization=build_norm_layer(normalization, num_features=ch_out), activation=None, ) - self.act = activation() if activation else nn.Identity() + if isinstance(activation, type): + activation = activation() + if activation is None: + activation = nn.Identity() + self.act = activation def forward(self, x: Tensor) -> Tensor: """Forward function.""" @@ -378,7 +382,7 @@ def __init__( RepVggBlock( hidden_channels, hidden_channels, - activation=activation, + activation=build_activation_layer(activation), normalization=normalization, ) for _ in range(num_blocks) diff --git a/src/otx/algo/detection/losses/__init__.py b/src/otx/algo/detection/losses/__init__.py index 14ca6431030..44124aeddc4 100644 --- a/src/otx/algo/detection/losses/__init__.py +++ b/src/otx/algo/detection/losses/__init__.py @@ -1,9 +1,10 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """Custom OTX Losses for Object Detection.""" from .atss_loss import ATSSCriterion +from .dfine_loss import DFINECriterion from .rtdetr_loss import DetrCriterion from .rtmdet_loss import RTMDetCriterion from .ssd_loss import SSDCriterion @@ -17,4 +18,5 @@ "SSDCriterion", "YOLOv9Criterion", "YOLOXCriterion", + "DFINECriterion", ] diff --git a/src/otx/algo/detection/losses/dfine_loss.py b/src/otx/algo/detection/losses/dfine_loss.py new file mode 100644 index 00000000000..8c438922f18 --- /dev/null +++ b/src/otx/algo/detection/losses/dfine_loss.py @@ -0,0 +1,501 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""D-FINE criterion implementations. Modified from https://github.com/Peterande/D-FINE.""" + + +from __future__ import annotations + +from typing import Callable + +import torch +import torch.distributed +import torch.nn.functional as f +from torch import Tensor, nn +from torchvision.ops import box_convert + +from otx.algo.common.utils.assigners.hungarian_matcher import HungarianMatcher +from otx.algo.common.utils.bbox_overlaps import bbox_overlaps +from otx.algo.detection.utils.utils import dfine_bbox2distance + + +class DFINECriterion(nn.Module): + """D-Fine criterion with FGL and DDF losses. + + TODO(Eugene): Consider merge with RTDETRCriterion in the next PR. + + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + + Args: + weight_dict (dict[str, int | float]): A dictionary containing the weights for different loss components. + alpha (float, optional): The alpha parameter for the loss calculation. Defaults to 0.2. + gamma (float, optional): The gamma parameter for the loss calculation. Defaults to 2.0. + num_classes (int, optional): The number of classes. Defaults to 80. + reg_max (int, optional): The maximum number of bin targets. Defaults to 32. 
+ """ + + def __init__( + self, + weight_dict: dict[str, int | float], + alpha: float = 0.2, + gamma: float = 2.0, + num_classes: int = 80, + reg_max: int = 32, + ): + super().__init__() + self.num_classes = num_classes + self.matcher = HungarianMatcher( + cost_dict={ + "cost_class": 2.0, + "cost_bbox": 5.0, + "cost_giou": 2.0, + }, + ) + self.weight_dict = weight_dict + self.alpha = alpha + self.gamma = gamma + self.reg_max = reg_max + self.num_pos, self.num_neg = 0.0, 0.0 + + def loss_labels_vfl( + self, + outputs: dict[str, Tensor], + targets: list[dict[str, Tensor]], + indices: list[tuple[int, int]], + num_boxes: int, + ) -> dict[str, Tensor]: + """Varifocal Loss (VFL) for label prediction. + + Args: + outputs (dict[str, Tensor]): Model outputs. + targets (List[Dict[str, Tensor]]): List of target dictionaries. + indices (List[Tuple[int, int]]): List of tuples of indices. + num_boxes (int): Number of predicted boxes. + + Returns: + dict[str, Tensor]: The loss dictionary. + """ + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + ious = bbox_overlaps( + box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + ) + ious = torch.diag(ious).detach() + + src_logits = outputs["pred_logits"] + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + target = f.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + + target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype) + target_score_o[idx] = ious.to(target_score_o.dtype) + target_score = target_score_o.unsqueeze(-1) * target + + pred_score = f.sigmoid(src_logits).detach() + weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score + + loss = f.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction="none") + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {"loss_vfl": loss} + + def loss_boxes( + self, + outputs: dict[str, Tensor], + targets: list[dict[str, Tensor]], + indices: list[tuple[int, int]], + num_boxes: int, + ) -> dict[str, Tensor]: + """Compute the losses re)L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + + Args: + outputs (dict[str, Tensor]): The outputs of the model. + targets (list[dict[str, Tensor]]): The targets. + indices (list[tuple[int, int]]): The indices of the matched boxes. + num_boxes (int): The number of boxes. + + Returns: + dict[str, Tensor]: The losses. 
+ """ + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + losses = {} + loss_bbox = f.l1_loss(src_boxes, target_boxes, reduction="none") + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + bbox_overlaps( + box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + mode="giou", + ), + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + + return losses + + def loss_local( + self, + outputs: dict[str, Tensor], + targets: list[dict[str, Tensor]], + indices: list[tuple[int, int]], + num_boxes: int, + temperature: int = 5, + ) -> dict[str, Tensor]: + """Compute Fine-Grained Localization (FGL) Loss and Decoupled Distillation Focal (DDF) Loss. + + Args: + outputs (dict[str, Tensor]): The outputs of the model. + targets (list[dict[str, Tensor]]): The targets. + indices (list[tuple[int, int]]): The indices of the matched boxes. + num_boxes (int): The number of boxes. + temperature (int, optional): Temperature for distillation. Defaults to 5. + + Returns: + dict[str, Tensor]: FGL and DDF losses. + """ + losses = {} + if "pred_corners" in outputs: + idx = self._get_src_permutation_idx(indices) + target_boxes = torch.cat( + [t["boxes"][i] for t, (_, i) in zip(targets, indices)], + dim=0, + ) + + pred_corners = outputs["pred_corners"][idx].reshape(-1, (self.reg_max + 1)) + ref_points = outputs["ref_points"][idx].detach() + with torch.no_grad(): + target_corners, weight_right, weight_left = dfine_bbox2distance( + ref_points, + box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + self.reg_max, + outputs["reg_scale"], + outputs["up"], + ) + + ious = torch.diag( + bbox_overlaps( + box_convert(outputs["pred_boxes"][idx], in_fmt="cxcywh", out_fmt="xyxy"), + box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + ), + ) + weight_targets = ious.unsqueeze(-1).repeat(1, 1, 4).reshape(-1).detach() + + losses["loss_fgl"] = DFINECriterion.fgl_loss( + pred_corners, + target_corners, + weight_right, + weight_left, + weight_targets, + avg_factor=num_boxes, + ) + + # Compute Decoupled Distillation Focal (DDF) Loss + if "teacher_corners" in outputs and outputs["teacher_corners"] is not None: + pred_corners = outputs["pred_corners"].reshape(-1, (self.reg_max + 1)) + target_corners = outputs["teacher_corners"].reshape(-1, (self.reg_max + 1)) + if torch.equal(pred_corners, target_corners): + losses["loss_ddf"] = pred_corners.sum() * 0 + else: + weight_targets_local = outputs["teacher_logits"].sigmoid().max(dim=-1)[0] + + mask = torch.zeros_like(weight_targets_local, dtype=torch.bool) + mask[idx] = True + mask = mask.unsqueeze(-1).repeat(1, 1, 4).reshape(-1) + + weight_targets_local[idx] = ious.reshape_as(weight_targets_local[idx]).to( + weight_targets_local.dtype, + ) + weight_targets_local = weight_targets_local.unsqueeze(-1).repeat(1, 1, 4).reshape(-1).detach() + + loss_match_local = ( + weight_targets_local + * (temperature**2) + * ( + nn.KLDivLoss(reduction="none")( + f.log_softmax(pred_corners / temperature, dim=1), + f.softmax(target_corners.detach() / temperature, dim=1), + ) + ).sum(-1) + ) + if "is_dn" not in outputs: + batch_scale = 8 / outputs["pred_boxes"].shape[0] # Avoid the influence of batch size per GPU + self.num_pos, self.num_neg = ( + (mask.sum() * batch_scale) ** 0.5, + ((~mask).sum() * batch_scale) ** 0.5, + ) + loss_match_local1 = loss_match_local[mask].mean() if 
mask.any() else 0 + loss_match_local2 = loss_match_local[~mask].mean() if (~mask).any() else 0 + losses["loss_ddf"] = (loss_match_local1 * self.num_pos + loss_match_local2 * self.num_neg) / ( + self.num_pos + self.num_neg + ) + + return losses + + def _get_src_permutation_idx( + self, + indices: list[tuple[Tensor, Tensor]], + ) -> tuple[Tensor, Tensor]: + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_go_indices( + self, + indices: list[tuple[Tensor, Tensor]], + indices_aux_list: list[list[tuple[Tensor, Tensor]]], + ) -> list[Tensor]: + """Get a matching union set across all decoder layers. + + Args: + indices: matching indices of the last decoder layer + indices_aux_list: matching indices of all decoder layers + """ + results = [] + for indices_aux in indices_aux_list: + indices = [ + (torch.cat([idx1[0], idx2[0]]), torch.cat([idx1[1], idx2[1]])) + for idx1, idx2 in zip(indices.copy(), indices_aux.copy()) + ] + + for ind in [torch.cat([idx[0][:, None], idx[1][:, None]], 1) for idx in indices]: + unique, counts = torch.unique(ind, return_counts=True, dim=0) + count_sort_indices = torch.argsort(counts, descending=True) + unique_sorted = unique[count_sort_indices] + column_to_row = {} + for idx in unique_sorted: + row_idx, col_idx = idx[0].item(), idx[1].item() + if row_idx not in column_to_row: + column_to_row[row_idx] = col_idx + final_rows = torch.tensor(list(column_to_row.keys()), device=ind.device) + final_cols = torch.tensor(list(column_to_row.values()), device=ind.device) + results.append((final_rows.long(), final_cols.long())) + return results + + @property + def _available_losses(self) -> tuple[Callable]: + return (self.loss_boxes, self.loss_labels_vfl, self.loss_local) # type: ignore[return-value] + + def forward( + self, + outputs: dict[str, Tensor], + targets: list[dict[str, Tensor]], + ) -> dict[str, Tensor]: + """This performs the loss computation. + + Args: + outputs (dict[str, torch.Tensor]): dict of tensors, see the output + specification of the model for the format + targets (list[dict[str, torch.Tensor]]): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + Returns: + dict[str, torch.Tensor]: dict of losses + """ + outputs_without_aux = {k: v for k, v in outputs.items() if "aux" not in k} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Get the matching union set across all decoder layers. 
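`_get_go_indices` above builds that union: per-layer (query, gt) matches are concatenated, duplicates are counted, and each query is kept once with the ground truth it was matched to most often. A toy reproduction of the deduplication step (pure torch, fabricated index pairs):

```python
import torch

# (query index, gt index) pairs gathered from the last layer plus two aux layers
pairs = torch.tensor([[0, 1], [2, 0], [0, 1], [3, 2], [2, 0], [0, 2]])
unique, counts = torch.unique(pairs, return_counts=True, dim=0)
order = torch.argsort(counts, descending=True)
unique_sorted = unique[order]

query_to_gt: dict[int, int] = {}
for q, g in unique_sorted.tolist():
    query_to_gt.setdefault(q, g)   # keep the most frequent match per query

rows = torch.tensor(list(query_to_gt.keys()))
cols = torch.tensor(list(query_to_gt.values()))
print(rows.tolist(), cols.tolist())   # e.g. [0, 2, 3] matched to [1, 0, 2]
```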
+ indices_aux_list, cached_indices, cached_indices_enc = [], [], [] + for aux_outputs in outputs["aux_outputs"] + [outputs["pre_outputs"]]: + indices_aux = self.matcher(aux_outputs, targets) + cached_indices.append(indices_aux) + indices_aux_list.append(indices_aux) + for aux_outputs in outputs["enc_aux_outputs"]: + indices_enc = self.matcher(aux_outputs, targets) + cached_indices_enc.append(indices_enc) + indices_aux_list.append(indices_enc) + indices_go = self._get_go_indices(indices, indices_aux_list) + + num_boxes_go = sum(len(x[0]) for x in indices_go) + num_boxes_go = torch.as_tensor( + [num_boxes_go], + dtype=torch.float, + device=next(iter(outputs.values())).device, + ) + num_boxes_go = torch.clamp(num_boxes_go, min=1).item() + + # Compute the average number of target boxes across all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self._available_losses: + indices_in = indices_go if loss in [self.loss_boxes, self.loss_local] else indices + num_boxes_in = num_boxes_go if loss in [self.loss_boxes, self.loss_local] else num_boxes + l_dict = loss(outputs, targets, indices_in, num_boxes_in) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + losses.update(l_dict) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + aux_outputs["up"], aux_outputs["reg_scale"] = outputs["up"], outputs["reg_scale"] + for loss in self._available_losses: + if loss in [self.loss_boxes, self.loss_local]: + indices_in = indices_go + num_boxes_in = num_boxes_go + else: + indices_in = cached_indices[i] + num_boxes_in = num_boxes + l_dict = loss(aux_outputs, targets, indices_in, num_boxes_in) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f"_aux_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of auxiliary traditional head output at first decoder layer. + if "pre_outputs" in outputs: + aux_outputs = outputs["pre_outputs"] + for loss in self._available_losses: + if loss in [self.loss_boxes, self.loss_local]: + indices_in = indices_go + num_boxes_in = num_boxes_go + else: + indices_in = cached_indices[-1] + num_boxes_in = num_boxes + l_dict = loss(aux_outputs, targets, indices_in, num_boxes_in) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + "_pre": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of encoder auxiliary losses. + if "enc_aux_outputs" in outputs: + enc_targets = targets + for i, aux_outputs in enumerate(outputs["enc_aux_outputs"]): + for loss in self._available_losses: + if loss == self.loss_boxes: + indices_in = indices_go + num_boxes_in = num_boxes_go + else: + indices_in = cached_indices_enc[i] + num_boxes_in = num_boxes + l_dict = loss(aux_outputs, enc_targets, indices_in, num_boxes_in) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f"_enc_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of cdn auxiliary losses. 
For dfine + if "dn_outputs" in outputs: + indices_dn = self.get_cdn_matched_indices(outputs["dn_meta"], targets) + dn_num_boxes = num_boxes * outputs["dn_meta"]["dn_num_group"] + dn_num_boxes = dn_num_boxes if dn_num_boxes > 0 else 1 + + for i, aux_outputs in enumerate(outputs["dn_outputs"]): + aux_outputs["is_dn"] = True + aux_outputs["up"], aux_outputs["reg_scale"] = outputs["up"], outputs["reg_scale"] + for loss in self._available_losses: + l_dict = loss(aux_outputs, targets, indices_dn, dn_num_boxes) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f"_dn_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of auxiliary traditional head output at first decoder layer. + if "dn_pre_outputs" in outputs: + aux_outputs = outputs["dn_pre_outputs"] + for loss in self._available_losses: + l_dict = loss(aux_outputs, targets, indices_dn, dn_num_boxes) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + "_dn_pre": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + @staticmethod + def get_cdn_matched_indices( + dn_meta: dict[str, list[Tensor]], + targets: list[dict[str, Tensor]], + ) -> list[tuple[torch.Tensor, torch.Tensor]]: + """get_cdn_matched_indices. + + Args: + dn_meta (dict[str, list[torch.Tensor]]): meta data for cdn + targets (list[dict[str, torch.Tensor]]): targets + """ + dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + num_gts = [len(t["labels"]) for t in targets] + device = targets[0]["labels"].device + + dn_match_indices = [] + for i, num_gt in enumerate(num_gts): + if num_gt > 0: + gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device) + gt_idx = gt_idx.tile(dn_num_group) + if len(dn_positive_idx[i]) != len(gt_idx): + msg = "The number of positive indices should be equal to the number of ground truths." + raise ValueError(msg) + dn_match_indices.append((dn_positive_idx[i], gt_idx)) + else: + dn_match_indices.append( + ( + torch.zeros(0, dtype=torch.int64, device=device), + torch.zeros(0, dtype=torch.int64, device=device), + ), + ) + + return dn_match_indices + + @staticmethod + def fgl_loss( + preds: Tensor, + targets: Tensor, + weight_right: Tensor, + weight_left: Tensor, + iou_weight: Tensor | None = None, + reduction: str = "sum", + avg_factor: float | None = None, + ) -> Tensor: + """Fine-Grained Localization (FGL) Loss. + + Args: + preds (Tensor): predicted distances + targets (Tensor): target distances + weight_right (Tensor): weight for right distance + weight_left (Tensor): weight for left distance + iou_weight (Tensor, optional): IoU weight. Defaults to None. + reduction (str, optional): reduction method. Defaults to "sum". + avg_factor (float, optional): average factor. Defaults to None. 
+ + Returns: + Tensor: FGL loss + """ + dis_left = targets.long() + dis_right = dis_left + 1 + + loss_left = f.cross_entropy( + preds, + dis_left, + reduction="none", + ) * weight_left.reshape(-1) + + loss_right = f.cross_entropy( + preds, + dis_right, + reduction="none", + ) * weight_right.reshape(-1) + + loss = loss_left + loss_right + + if iou_weight is not None: + iou_weight = iou_weight.float() + loss = loss * iou_weight + + if avg_factor is not None: + loss = loss.sum() / avg_factor + elif reduction == "mean": + loss = loss.mean() + elif reduction == "sum": + loss = loss.sum() + + return loss diff --git a/src/otx/algo/detection/necks/dfine_hybrid_encoder.py b/src/otx/algo/detection/necks/dfine_hybrid_encoder.py new file mode 100644 index 00000000000..918cdfff878 --- /dev/null +++ b/src/otx/algo/detection/necks/dfine_hybrid_encoder.py @@ -0,0 +1,438 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""D-FINE Hybrid Encoder. Modified from D-FINE (https://github.com/Peterande/D-FINE).""" + +from __future__ import annotations + +import copy +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, ClassVar + +import torch +import torch.nn.functional as f +from torch import Tensor, nn + +from otx.algo.common.layers.transformer_layers import TransformerEncoder, TransformerEncoderLayer +from otx.algo.detection.layers.csp_layer import CSPRepLayer +from otx.algo.detection.utils.utils import auto_pad +from otx.algo.modules.activation import build_activation_layer +from otx.algo.modules.conv_module import Conv2dModule +from otx.algo.modules.norm import build_norm_layer + + +class SCDown(nn.Module): + """SCDown downsampling module. + + Args: + c1 (int): Number of channels in the input feature map. + c2 (int): Number of channels produced by the convolution. + k (int): Kernel size of the convolving kernel. + s (int): Stride of the convolution. + normalization (Callable[..., nn.Module] | None): Normalization layer module. + """ + + def __init__( + self, + c1: int, + c2: int, + k: int, + s: int, + normalization: Callable[..., nn.Module] | None = None, + ) -> None: + super().__init__() + self.cv1 = Conv2dModule( + c1, + c2, + 1, + 1, + normalization=build_norm_layer(normalization, num_features=c2), + activation=None, + ) + self.cv2 = Conv2dModule( + c2, + c2, + k, + s, + padding=auto_pad(kernel_size=k), + groups=c2, + normalization=build_norm_layer(normalization, num_features=c2), + activation=None, + ) + + def forward(self, x: Tensor) -> Tensor: + """Forward pass.""" + return self.cv2(self.cv1(x)) + + +class RepNCSPELAN4(nn.Module): + """GELANModule from YOLOv9. + + Note: + Might not be replaceable as layer implementation is very different from GELANModule in YOLOv9. + + Args: + c1 (int): c1 channel size. Refer to GELAN paper. + c2 (int): c2 channel size. Refer to GELAN paper. + c3 (int): c3 channel size. Refer to GELAN paper. + c4 (int): c4 channel size. Refer to GELAN paper. + n (int, optional): number of blocks. Defaults to 3. + bias (bool, optional): use bias. Defaults to False. + activation (Callable[..., nn.Module] | None, optional): activation function. Defaults to None. + normalization (Callable[..., nn.Module] | None, optional): norm layer. Defaults to None. 
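`SCDown` above is a cheap downsampler: a 1x1 pointwise conv to set the channel count, then a stride-s depthwise conv (groups equal to the channel count). The equivalent in plain torch, with normalization omitted for brevity:

```python
import torch
from torch import nn

c1, c2, k, s = 128, 256, 3, 2
scdown = nn.Sequential(
    nn.Conv2d(c1, c2, kernel_size=1, stride=1),                              # pointwise: 128 -> 256 channels
    nn.Conv2d(c2, c2, kernel_size=k, stride=s, padding=k // 2, groups=c2),   # depthwise, halves H and W
)
x = torch.randn(1, c1, 40, 40)
print(scdown(x).shape)   # torch.Size([1, 256, 20, 20])
```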
+ """ + + def __init__( + self, + c1: int, + c2: int, + c3: int, + c4: int, + num_blocks: int = 3, + bias: bool = False, + activation: Callable[..., nn.Module] | None = None, + normalization: Callable[..., nn.Module] | None = None, + ) -> None: + super().__init__() + self.c = c3 // 2 + + self.cv1 = Conv2dModule( + c1, + c3, + 1, + 1, + bias=bias, + activation=build_activation_layer(activation), + normalization=build_norm_layer(normalization, num_features=c3), + ) + + self.cv2 = nn.Sequential( + CSPRepLayer( + c3 // 2, + c4, + num_blocks, + 1, + bias=bias, + activation=activation, + normalization=normalization, + ), + Conv2dModule( + c4, + c4, + 3, + 1, + padding=auto_pad(kernel_size=3), + bias=bias, + activation=build_activation_layer(activation), + normalization=build_norm_layer(normalization, num_features=c4), + ), + ) + + self.cv3 = nn.Sequential( + CSPRepLayer( + c4, + c4, + num_blocks, + 1, + bias=bias, + activation=activation, + normalization=normalization, + ), + Conv2dModule( + c4, + c4, + 3, + 1, + padding=auto_pad(kernel_size=3), + bias=bias, + activation=build_activation_layer(activation), + normalization=build_norm_layer(normalization, num_features=c4), + ), + ) + + self.cv4 = Conv2dModule( + c3 + (2 * c4), + c2, + 1, + 1, + bias=bias, + activation=build_activation_layer(activation), + normalization=build_norm_layer(normalization, num_features=c2), + ) + + def forward(self, x: Tensor) -> Tensor: + """Forward pass.""" + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in [self.cv2, self.cv3]) + return self.cv4(torch.cat(y, 1)) + + +class HybridEncoderModule(nn.Module): + """HybridEncoder for DFine. + + TODO(Eugene): Merge with current rtdetr.HybridEncoderModule in next PR. + + Args: + in_channels (list[int], optional): List of input channels for each feature map. + Defaults to [512, 1024, 2048]. + feat_strides (list[int], optional): List of stride values for + each feature map. Defaults to [8, 16, 32]. + hidden_dim (int, optional): Hidden dimension size. Defaults to 256. + nhead (int, optional): Number of attention heads in the transformer encoder. + Defaults to 8. + dim_feedforward (int, optional): Dimension of the feedforward network + in the transformer encoder. Defaults to 1024. + dropout (float, optional): Dropout rate. Defaults to 0.0. + enc_activation (Callable[..., nn.Module]): Activation layer module. + Defaults to ``nn.GELU``. + normalization (Callable[..., nn.Module]): Normalization layer module. + Defaults to ``partial(build_norm_layer, nn.BatchNorm2d, layer_name="norm")``. + use_encoder_idx (list[int], optional): List of indices of the encoder to use. + Defaults to [2]. + num_encoder_layers (int, optional): Number of layers in the transformer encoder. + Defaults to 1. + pe_temperature (float, optional): Temperature parameter for positional encoding. + Defaults to 10000. + expansion (float, optional): Expansion factor for the CSPRepLayer. + Defaults to 1.0. + depth_mult (float, optional): Depth multiplier for the CSPRepLayer. + Defaults to 1.0. + activation (Callable[..., nn.Module]): Activation layer module. + Defaults to ``nn.SiLU``. + eval_spatial_size (tuple[int, int] | None, optional): Spatial size for + evaluation. Defaults to None. 
+ """ + + def __init__( + self, + in_channels: list[int] = [512, 1024, 2048], # noqa: B006 + feat_strides: list[int] = [8, 16, 32], # noqa: B006 + hidden_dim: int = 256, + nhead: int = 8, + dim_feedforward: int = 1024, + dropout: float = 0.0, + enc_activation: Callable[..., nn.Module] = nn.GELU, + normalization: Callable[..., nn.Module] = partial(build_norm_layer, nn.BatchNorm2d, layer_name="norm"), + use_encoder_idx: list[int] = [2], # noqa: B006 + num_encoder_layers: int = 1, + pe_temperature: int = 10000, + expansion: float = 1.0, + depth_mult: float = 1.0, + activation: Callable[..., nn.Module] = nn.SiLU, + eval_spatial_size: tuple[int, int] | None = None, + ): + super().__init__() + self.in_channels = in_channels + self.feat_strides = feat_strides + self.hidden_dim = hidden_dim + self.use_encoder_idx = use_encoder_idx + self.num_encoder_layers = num_encoder_layers + self.pe_temperature = pe_temperature + self.eval_spatial_size = eval_spatial_size + self.out_channels = [hidden_dim for _ in range(len(in_channels))] + self.out_strides = feat_strides + + # channel projection + self.input_proj = nn.ModuleList() + for in_channel in in_channels: + self.input_proj.append( + nn.Sequential( + OrderedDict( + [ + ("conv", nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False)), + ("norm", nn.BatchNorm2d(hidden_dim)), + ], + ), + ), + ) + + # encoder transformer + encoder_layer = TransformerEncoderLayer( + hidden_dim, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=enc_activation, + ) + + self.encoder = nn.ModuleList( + [TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) for _ in range(len(use_encoder_idx))], + ) + + # top-down fpn + self.lateral_convs = nn.ModuleList() + self.fpn_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append( + Conv2dModule( + hidden_dim, + hidden_dim, + 1, + 1, + normalization=build_norm_layer(normalization, num_features=hidden_dim), + activation=None, + ), + ) + self.fpn_blocks.append( + RepNCSPELAN4( + hidden_dim * 2, + hidden_dim, + hidden_dim * 2, + round(expansion * hidden_dim // 2), + round(3 * depth_mult), + activation=activation, + normalization=normalization, + ), + ) + + # bottom-up pan + self.downsample_convs = nn.ModuleList() + self.pan_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1): + self.downsample_convs.append( + nn.Sequential( + SCDown( + hidden_dim, + hidden_dim, + 3, + 2, + normalization=normalization, + ), + ), + ) + self.pan_blocks.append( + RepNCSPELAN4( + hidden_dim * 2, + hidden_dim, + hidden_dim * 2, + round(expansion * hidden_dim // 2), + round(3 * depth_mult), + activation=activation, + normalization=normalization, + ), + ) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + """Reset parameters.""" + if self.eval_spatial_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] + pos_embed = self.build_2d_sincos_position_embedding( + self.eval_spatial_size[1] // stride, + self.eval_spatial_size[0] // stride, + self.hidden_dim, + self.pe_temperature, + ) + setattr(self, f"pos_embed{idx}", pos_embed) + + @staticmethod + def build_2d_sincos_position_embedding( + w: int, + h: int, + embed_dim: int = 256, + temperature: float = 10000.0, + ) -> Tensor: + """Build 2D sin-cos position embedding.""" + grid_w = torch.arange(int(w), dtype=torch.float32) + grid_h = torch.arange(int(h), dtype=torch.float32) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") + if embed_dim % 4 != 0: + msg 
= "Embed dimension must be divisible by 4 for 2D sin-cos position embedding" + raise ValueError(msg) + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = 1.0 / (temperature**omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :] + + def forward(self, feats: Tensor) -> list[Tensor]: + """Forward pass.""" + if len(feats) != len(self.in_channels): + msg = f"Input feature size {len(feats)} does not match the number of input channels {len(self.in_channels)}" + raise ValueError(msg) + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + + # encoder + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1) + if self.training or self.eval_spatial_size is None: + pos_embed = self.build_2d_sincos_position_embedding(w, h, self.hidden_dim, self.pe_temperature).to( + src_flatten.device, + ) + else: + pos_embed = getattr(self, f"pos_embed{enc_ind}").to(src_flatten.device) + + memory = self.encoder[i](src_flatten, pos_embed=pos_embed) + proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() + + # broadcasting and fusion + inner_outs = [proj_feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = proj_feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh) + inner_outs[0] = feat_heigh + upsample_feat = f.interpolate(feat_heigh, scale_factor=2.0, mode="nearest") + inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](torch.concat([upsample_feat, feat_low], dim=1)) + inner_outs.insert(0, inner_out) + + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_height], dim=1)) + outs.append(out) + + return outs + + +class HybridEncoder: + """HybridEncoder factory for D-Fine detection.""" + + encoder_cfg: ClassVar[dict[str, Any]] = { + "dfine_hgnetv2_n": { + "in_channels": [512, 1024], + "feat_strides": [16, 32], + "hidden_dim": 128, + "use_encoder_idx": [1], + "dim_feedforward": 512, + "expansion": 0.34, + "depth_mult": 0.5, + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_s": { + "in_channels": [256, 512, 1024], + "hidden_dim": 256, + "expansion": 0.5, + "depth_mult": 0.34, + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_m": { + "in_channels": [384, 768, 1536], + "hidden_dim": 256, + "depth_mult": 0.67, + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_l": {}, + "dfine_hgnetv2_x": { + "hidden_dim": 384, + "dim_feedforward": 2048, + }, + } + + def __new__(cls, model_name: str) -> HybridEncoderModule: + """Constructor for HybridEncoder.""" + if model_name not in cls.encoder_cfg: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + return HybridEncoderModule(**cls.encoder_cfg[model_name]) diff --git a/src/otx/algo/detection/utils/utils.py b/src/otx/algo/detection/utils/utils.py index 143ea1571cd..78c73f7eace 100644 --- a/src/otx/algo/detection/utils/utils.py +++ b/src/otx/algo/detection/utils/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel 
Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. """Utils for otx detection algo. @@ -6,6 +6,7 @@ Reference : - https://github.com/open-mmlab/mmdetection/blob/v3.2.0/mmdet/models/utils. - https://github.com/open-mmlab/mmdeploy/blob/v1.3.1/mmdeploy/codebase/mmdet/structures/bbox/transforms. + - https://github.com/Peterande/D-FINE/blob/master/src/zoo/dfine/dfine_utils.py """ from __future__ import annotations @@ -16,6 +17,7 @@ from einops import rearrange from torch import Tensor, nn from torch.autograd import Function +from torchvision.ops import box_convert from otx.algo.utils.mmengine_utils import InstanceData from otx.core.data.entity.detection import DetBatchDataEntity @@ -334,3 +336,164 @@ def set_info_into_instance(layer_dict: dict[str, Any]) -> nn.Module: for k, v in layer_dict.items(): setattr(layer, k, v) return layer + + +def dfine_weighting_function(reg_max: int, up: Tensor, reg_scale: Tensor) -> Tensor: + """Generates the non-uniform Weighting Function W(n) for bounding box regression. + + Args: + reg_max (int): Max number of the discrete bins. + up (Tensor): Controls upper bounds of the sequence, where maximum offset is ±up * H / W. + reg_scale (Tensor): Controls the curvature of the Weighting Function. + Larger values result in flatter weights near the central axis W(reg_max/2)=0 + and steeper weights at both ends. + deploy (bool): If True, uses deployment mode settings. + + Returns: + Tensor: Sequence of Weighting Function. + """ + upper_bound1 = abs(up[0]) * abs(reg_scale) + upper_bound2 = abs(up[0]) * abs(reg_scale) * 2 + step = (upper_bound1 + 1) ** (2 / (reg_max - 2)) + left_values = [-((step) ** i) + 1 for i in range(reg_max // 2 - 1, 0, -1)] + right_values = [(step) ** i - 1 for i in range(1, reg_max // 2)] + return torch.cat( + [ + -upper_bound2, + torch.cat(left_values), + torch.zeros_like(up[0][None]), + torch.cat(right_values), + upper_bound2, + ], + 0, + ) + + +def dfine_distance2bbox(points: Tensor, distance: Tensor, reg_scale: Tensor) -> Tensor: + """Decodes edge-distances into bounding box coordinates. + + Args: + points (Tensor): (B, N, 4) or (N, 4) format, representing [x, y, w, h], + where (x, y) is the center and (w, h) are width and height. + distance (Tensor): (B, N, 4) or (N, 4), representing distances from the + point to the left, top, right, and bottom boundaries. + + reg_scale (float): Controls the curvature of the Weighting Function. + + Returns: + Tensor: Bounding boxes in (N, 4) or (B, N, 4) format [cx, cy, w, h]. + """ + reg_scale = abs(reg_scale) + x1 = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale) + y1 = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale) + x2 = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale) + y2 = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale) + + bboxes = torch.stack([x1, y1, x2, y2], -1) + return box_convert(bboxes, in_fmt="xyxy", out_fmt="cxcywh") + + +def dfine_bbox2distance( + points: Tensor, + bbox: Tensor, + reg_max: int, + reg_scale: Tensor, + up: Tensor, + eps: float = 0.1, +) -> tuple[Tensor, Tensor, Tensor]: + """Converts bounding box coordinates to distances from a reference point. + + Refer to D-Fine: https://github.com/Peterande/D-FINE. + + Args: + points (Tensor): (n, 4) [x, y, w, h], where (x, y) is the center. + bbox (Tensor): (n, 4) bounding boxes in "xyxy" format. 
+ reg_max (float): Maximum bin value. + reg_scale (float): Controling curvarture of W(n). + up (Tensor): Controling upper bounds of W(n). + eps (float): Small value to ensure target < reg_max. + + Returns: + Tuple[Tensor, Tensor, Tensor]: + - indices (Tensor): Index of the left bin closest to each GT value, shape (N, ). + - weight_right (Tensor): Weight assigned to the right bin, shape (N, ). + - weight_left (Tensor): Weight assigned to the left bin, shape (N, ). + """ + + def _translate_gt(gt: Tensor, reg_max: int, reg_scale: Tensor, up: Tensor) -> tuple[Tensor, Tensor, Tensor]: + """Decodes bounding box ground truth (GT) values into distribution-based GT representations. + + This function maps continuous GT values into discrete distribution bins, which can be used + for regression tasks in object detection models. + + It calculates the indices of the closest bins to each GT value and assigns interpolation weights + to these bins based on their proximity to the GT value. + + In the paper: + 'a' (up) controlling the upper bounds. + 'c' (reg_scale) controlling the curvature. + + Args: + gt (Tensor): Ground truth bounding box values, shape (N, ). + reg_max (int): Maximum number of discrete bins for the distribution. + reg_scale (Tensor): Controls the curvature of the Weighting Function. + up (Tensor): Controls the upper bounds of the Weighting Function. + + Returns: + Tuple[Tensor, Tensor, Tensor]: + - indices (Tensor): Index of the left bin closest to each GT value, shape (N, ). + - weight_right (Tensor): Weight assigned to the right bin, shape (N, ). + - weight_left (Tensor): Weight assigned to the left bin, shape (N, ). + """ + gt = gt.reshape(-1) + function_values = dfine_weighting_function(reg_max, up, reg_scale) + + # Find the closest left-side indices for each value + diffs = function_values.unsqueeze(0) - gt.unsqueeze(1) + mask = diffs <= 0 + closest_left_indices = torch.sum(mask, dim=1) - 1 + + # Calculate the weights for the interpolation + indices = closest_left_indices.float() + + weight_right = torch.zeros_like(indices) + weight_left = torch.zeros_like(indices) + + valid_idx_mask = (indices >= 0) & (indices < reg_max) + valid_indices = indices[valid_idx_mask].long() + + # Obtain distances + left_values = function_values[valid_indices] + right_values = function_values[valid_indices + 1] + + left_diffs = torch.abs(gt[valid_idx_mask] - left_values) + right_diffs = torch.abs(right_values - gt[valid_idx_mask]) + + # Valid weights + weight_right[valid_idx_mask] = left_diffs / (left_diffs + right_diffs) + weight_left[valid_idx_mask] = 1.0 - weight_right[valid_idx_mask] + + # Invalid weights (out of range) + invalid_idx_mask_neg = indices < 0 + weight_right[invalid_idx_mask_neg] = 0.0 + weight_left[invalid_idx_mask_neg] = 1.0 + indices[invalid_idx_mask_neg] = 0.0 + + invalid_idx_mask_pos = indices >= reg_max + weight_right[invalid_idx_mask_pos] = 1.0 + weight_left[invalid_idx_mask_pos] = 0.0 + indices[invalid_idx_mask_pos] = reg_max - 0.1 + + return indices, weight_right, weight_left + + reg_scale = abs(reg_scale) + # ϕ = (dᴳᵀ- d⁰) / {H, H, W, W} + left = (points[:, 0] - bbox[:, 0]) / (points[..., 2] / reg_scale + 1e-16) - 0.5 * reg_scale + top = (points[:, 1] - bbox[:, 1]) / (points[..., 3] / reg_scale + 1e-16) - 0.5 * reg_scale + right = (bbox[:, 2] - points[:, 0]) / (points[..., 2] / reg_scale + 1e-16) - 0.5 * reg_scale + bottom = (bbox[:, 3] - points[:, 1]) / (points[..., 3] / reg_scale + 1e-16) - 0.5 * reg_scale + four_lens = torch.stack([left, top, right, bottom], -1) + four_lens, 
weight_right, weight_left = _translate_gt(four_lens, reg_max, reg_scale, up) + if reg_max is not None: + four_lens = four_lens.clamp(min=0, max=reg_max - eps) + return four_lens.reshape(-1).detach(), weight_right.detach(), weight_left.detach() diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index 8fdf26736a7..ca51bb71725 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023-2024 Intel Corporation +# Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 """Helper to support TorchVision data transform functions.""" @@ -3921,3 +3921,44 @@ def _dispatch_transform(cls, cfg_transform: DictConfig | dict | tvt_v2.Transform raise TypeError(msg) return transform + + +class RandomIoUCrop(tvt_v2.RandomIoUCrop): + """Random IoU crop with the option to set probability. + + Args: + min_scale (float, optional): the same as RandomIoUCrop. Defaults to 0.3. + max_scale (float, optional): the same as RandomIoUCrop. Defaults to 1. + min_aspect_ratio (float, optional): the same as RandomIoUCrop. Defaults to 0.5. + max_aspect_ratio (float, optional): the same as RandomIoUCrop. Defaults to 2. + sampler_options (list[float] | None, optional): the same as RandomIoUCrop. Defaults to None. + trials (int, optional): the same as RandomIoUCrop. Defaults to 40. + p (float, optional): probability. Defaults to 1.0. + """ + + def __init__( + self, + min_scale: float = 0.3, + max_scale: float = 1, + min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2, + sampler_options: list[float] | None = None, + trials: int = 40, + p: float = 1.0, + ): + super().__init__( + min_scale, + max_scale, + min_aspect_ratio, + max_aspect_ratio, + sampler_options, + trials, + ) + self.p = p + + def __call__(self, *inputs: Any) -> Any: # noqa: ANN401 + """Apply the transform to the given inputs.""" + if torch.rand(1) >= self.p: + return inputs if len(inputs) > 1 else inputs[0] + + return super().forward(*inputs) diff --git a/src/otx/recipe/detection/dfine_x.yaml b/src/otx/recipe/detection/dfine_x.yaml new file mode 100644 index 00000000000..4be4342e94c --- /dev/null +++ b/src/otx/recipe/detection/dfine_x.yaml @@ -0,0 +1,129 @@ +model: + class_path: otx.algo.detection.d_fine.DFine + init_args: + model_name: dfine_hgnetv2_x + label_info: 80 + multi_scale: true + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.00025 + betas: [0.9, 0.999] + weight_decay: 0.000125 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 100 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 6 + monitor: val/map_50 +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/torchvision_base.yaml +overrides: + callbacks: + - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling + init_args: + max_interval: 1 + min_lrschedule_patience: 3 + - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup + init_args: + warmup_iters: 100 + warmup_epochs: 7 + + data: + input_size: + - 640 + - 640 + task: DETECTION + stack_images: true + data_format: coco_instances + train_subset: + batch_size: 8 + num_workers: 4 + to_tv_image: true + transforms: + - class_path: torchvision.transforms.v2.RandomPhotometricDistort + init_args: + p: 0.5 + - class_path: 
torchvision.transforms.v2.RandomZoomOut + init_args: + fill: 0 + - class_path: otx.core.data.transform_libs.torchvision.RandomIoUCrop + init_args: + p: 0.8 + - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes + init_args: + min_size: 1 + - class_path: otx.core.data.transform_libs.torchvision.RandomFlip + init_args: + prob: 0.5 + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + transform_bbox: true + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion + enable: false + init_args: + is_numpy_to_tvtensor: true + - class_path: otx.core.data.transform_libs.torchvision.RandomAffine + enable: false + init_args: + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.GaussianBlur + enable: false + init_args: + kernel_size: 5 + - class_path: torchvision.transforms.v2.RandomVerticalFlip + enable: false + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true + - class_path: torchvision.transforms.v2.GaussianNoise + enable: false + - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes + init_args: + min_size: 1 + sampler: + class_path: otx.algo.samplers.balanced_sampler.BalancedSampler + + val_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true + + test_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true diff --git a/src/otx/recipe/detection/dfine_x_tile.yaml b/src/otx/recipe/detection/dfine_x_tile.yaml new file mode 100644 index 00000000000..74523c361de --- /dev/null +++ b/src/otx/recipe/detection/dfine_x_tile.yaml @@ -0,0 +1,125 @@ +model: + class_path: otx.algo.detection.d_fine.DFine + init_args: + model_name: dfine_hgnetv2_x + label_info: 80 + multi_scale: false + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.00025 + betas: [0.9, 0.999] + weight_decay: 0.000125 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 20 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 9 + monitor: val/map_50 + min_lr: 2e-06 +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/detection_tile.yaml +overrides: + reset: + - data.train_subset.transforms + - data.val_subset.transforms + - data.test_subset.transforms + + data: + input_size: + - 640 + - 640 + task: DETECTION + stack_images: true + data_format: coco_instances + train_subset: + batch_size: 8 + num_workers: 4 + to_tv_image: true + transforms: + - class_path: torchvision.transforms.v2.RandomPhotometricDistort + init_args: + p: 0.5 + - class_path: torchvision.transforms.v2.RandomZoomOut + init_args: + fill: 0 + - class_path: otx.core.data.transform_libs.torchvision.RandomIoUCrop + init_args: + p: 0.8 + - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes + init_args: + min_size: 1 + - class_path: 
otx.core.data.transform_libs.torchvision.RandomFlip + init_args: + prob: 0.5 + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + transform_bbox: true + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion + enable: false + init_args: + is_numpy_to_tvtensor: true + - class_path: otx.core.data.transform_libs.torchvision.RandomAffine + enable: false + init_args: + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.GaussianBlur + enable: false + init_args: + kernel_size: 5 + - class_path: torchvision.transforms.v2.RandomVerticalFlip + enable: false + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true + - class_path: torchvision.transforms.v2.GaussianNoise + enable: false + - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes + init_args: + min_size: 1 + sampler: + class_path: otx.algo.samplers.balanced_sampler.BalancedSampler + + val_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true + + test_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index 1e670930cf0..fdbb7c73f00 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -97,6 +97,10 @@ "task": OTXTaskType.DETECTION, "model_name": "rtmdet_tiny", }, + "Object_Detection_DFine_X": { + "task": OTXTaskType.DETECTION, + "model_name": "dfine_x", + }, # INSTANCE_SEGMENTATION "Custom_Counting_Instance_Segmentation_MaskRCNN_ResNet50": { "task": OTXTaskType.INSTANCE_SEGMENTATION, diff --git a/src/otx/tools/templates/detection/detection/dfine_x/template.yaml b/src/otx/tools/templates/detection/detection/dfine_x/template.yaml new file mode 100644 index 00000000000..459395bdd7f --- /dev/null +++ b/src/otx/tools/templates/detection/detection/dfine_x/template.yaml @@ -0,0 +1,46 @@ +# Description. +model_template_id: Object_Detection_DFine_X +name: DFine-X +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for DFine-X +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.00025 + auto_hpo_state: POSSIBLE + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 202.486 +size: 240.0 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. 
+# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/tests/integration/api/test_xai.py b/tests/integration/api/test_xai.py index 8a565233f4a..5408cd5049e 100644 --- a/tests/integration/api/test_xai.py +++ b/tests/integration/api/test_xai.py @@ -51,6 +51,11 @@ def test_forward_explain( # TODO(Eugene): maskdino not support yet. pytest.skip(f"There's issue with inst-seg: {model_name}. Skip for now.") + if "dfine" in model_name: + # TODO(Eugene): dfine not support yet. + # https://jira.devtools.intel.com/browse/CVS-160781 + pytest.skip(f"There's issue with dfine: {model_name}. Skip for now.") + if "dino" in model_name: pytest.skip("DINO is not supported.") @@ -118,6 +123,11 @@ def test_predict_with_explain( # TODO(Eugene): maskdino not support yet. pytest.skip(f"There's issue with inst-seg: {model_name}. Skip for now.") + if "dfine" in model_name: + # TODO(Eugene): dfine not support yet. + # https://jira.devtools.intel.com/browse/CVS-160781 + pytest.skip(f"There's issue with dfine: {model_name}. Skip for now.") + if "rtmdet_tiny" in recipe: # TODO (sungchul): enable xai for rtmdet_tiny (CVS-142651) pytest.skip("rtmdet_tiny on detection is not supported yet.") diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index 649bbf6a4c1..0de2a490929 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -252,6 +252,9 @@ def test_otx_e2e( if "dino" in model_name: return # DINO is not supported. + if "dfine" in model_name: + return # DFine is not supported. + if "rtdetr" in model_name: return # RT-DETR currently is not supported. @@ -331,6 +334,9 @@ def test_otx_explain_e2e( if "dino" in model_name: pytest.skip("DINO is not supported.") + if "dfine" in model_name: + pytest.skip("DFine is not supported.") + if "maskrcnn_r50_tv" in model_name: pytest.skip("MaskRCNN R50 Torchvision model doesn't support explain.") elif "rtdetr" in recipe: diff --git a/tests/unit/algo/detection/test_dfine.py b/tests/unit/algo/detection/test_dfine.py new file mode 100644 index 00000000000..2d025849842 --- /dev/null +++ b/tests/unit/algo/detection/test_dfine.py @@ -0,0 +1,158 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of D-Fine.""" + +from unittest.mock import MagicMock + +import pytest +import torch +import torchvision +from otx.algo.detection.backbones.hgnetv2 import HGNetv2 +from otx.algo.detection.d_fine import DFine +from otx.algo.detection.heads.dfine_decoder import DFINETransformer +from otx.algo.detection.losses.dfine_loss import DFINECriterion +from otx.algo.detection.necks.dfine_hybrid_encoder import HybridEncoder +from otx.algo.detection.rtdetr import DETR +from otx.core.data.entity.detection import DetBatchPredEntity + + +class TestDFine: + @pytest.mark.parametrize( + "model", + [ + DFine(label_info=3, model_name="dfine_hgnetv2_x"), + ], + ) + def test_loss(self, model, fxt_data_module): + data = next(iter(fxt_data_module.train_dataloader())) + data.images = torch.randn([2, 3, 640, 640]) + model(data) + + @pytest.mark.parametrize( + "model", + [ + DFine(label_info=3, model_name="dfine_hgnetv2_x"), + ], + ) + def test_predict(self, model, fxt_data_module): + data = next(iter(fxt_data_module.train_dataloader())) + data.images = torch.randn(2, 3, 640, 640) + model.eval() + output = model(data) + assert isinstance(output, DetBatchPredEntity) + + @pytest.mark.parametrize( + "model", + [ + DFine(label_info=3, model_name="dfine_hgnetv2_x"), + ], + ) + def test_export(self, model): + model.eval() + 
output = model.forward_for_tracing(torch.randn(1, 3, 640, 640)) + assert len(output) == 3 + + @pytest.fixture() + def dfine_model(self): + num_classes = 10 + model_name = "dfine_hgnetv2_x" + backbone = HGNetv2(model_name=model_name) + encoder = HybridEncoder(model_name=model_name) + decoder = DFINETransformer( + model_name=model_name, + num_classes=num_classes, + ) + criterion = DFINECriterion( + weight_dict={ + "loss_vfl": 1, + "loss_bbox": 5, + "loss_giou": 2, + "loss_fgl": 0.15, + "loss_ddf": 1.5, + }, + alpha=0.75, + gamma=2.0, + reg_max=32, + num_classes=num_classes, + ) + return DETR(backbone=backbone, encoder=encoder, decoder=decoder, num_classes=10, criterion=criterion) + + @pytest.fixture() + def targets(self): + return [ + { + "boxes": torch.tensor([[0.2739, 0.2848, 0.3239, 0.3348], [0.1652, 0.1109, 0.2152, 0.1609]]), + "labels": torch.tensor([2, 2]), + }, + { + "boxes": torch.tensor( + [ + [0.6761, 0.8174, 0.7261, 0.8674], + [0.1652, 0.1109, 0.2152, 0.1609], + [0.2848, 0.9370, 0.3348, 0.9870], + ], + ), + "labels": torch.tensor([8, 2, 7]), + }, + ] + + @pytest.fixture() + def images(self): + return torch.randn(2, 3, 640, 640) + + def test_dfine_forward(self, dfine_model, images, targets): + dfine_model.train() + output = dfine_model(images, targets) + assert isinstance(output, dict) + for key in output: + assert key.startswith("loss_") + assert "loss_bbox" in output + assert "loss_vfl" in output + assert "loss_giou" in output + + def test_dfine_postprocess(self, dfine_model): + outputs = { + "pred_logits": torch.randn(2, 100, 10), + "pred_boxes": torch.randn(2, 100, 4), + } + original_sizes = [[640, 640], [640, 640]] + result = dfine_model.postprocess(outputs, original_sizes) + assert isinstance(result, tuple) + assert len(result) == 3 + scores, boxes, labels = result + assert isinstance(scores, list) + assert isinstance(boxes, list) + assert isinstance(boxes[0], torchvision.tv_tensors.BoundingBoxes) + assert boxes[0].canvas_size == original_sizes[0] + assert isinstance(labels, list) + assert len(scores) == 2 + assert len(boxes) == 2 + assert len(labels) == 2 + + def test_dfine_export(self, dfine_model, images): + dfine_model.eval() + dfine_model.num_top_queries = 10 + batch_img_metas = [{"img_shape": (740, 740), "scale_factor": 1.0}] + result = dfine_model.export(images, batch_img_metas) + assert isinstance(result, dict) + assert "bboxes" in result + assert "labels" in result + assert "scores" in result + assert result["bboxes"].shape == (2, 10, 4) + # ensure no scaling + assert torch.all(result["bboxes"] < 2) + + def test_set_input_size(self): + input_size = 1280 + model = DETR( + backbone=MagicMock(), + encoder=MagicMock(), + decoder=MagicMock(), + num_classes=10, + input_size=input_size, + ) + + expected_multi_scale = sorted([input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2) + + assert sorted(model.multi_scale) == expected_multi_scale diff --git a/tests/unit/core/data/test_tiling.py b/tests/unit/core/data/test_tiling.py index ba232509dd7..b5c791f70b2 100644 --- a/tests/unit/core/data/test_tiling.py +++ b/tests/unit/core/data/test_tiling.py @@ -124,6 +124,7 @@ def fxt_data_config(self, fxt_data_roots) -> dict[dict]: }, } + @pytest.mark.intense() def det_dummy_forward(self, x: DetBatchDataEntity) -> DetBatchPredEntity: """Dummy detection forward function for testing. 
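The `intense` marker added to these tile tests is what the new tox environments key off: the default unit-test run deselects them with `pytest -m "not intense"`, while the new `intense-unit-test-{py310, py311}` environment selects them with `pytest -m "intense"` (see the tox.ini hunk at the end of this patch). Below is a minimal sketch of how such a marker is typically registered so pytest does not flag it as unknown; whether OTX declares it in a conftest.py hook like this or in its pytest configuration file is an assumption here, not something shown in this diff.

    # conftest.py -- illustrative sketch only; the real OTX marker registration
    # may live in pyproject.toml under [tool.pytest.ini_options] markers instead.
    def pytest_configure(config):
        # Register the custom mark so `-m "intense"` / `-m "not intense"`
        # can select or deselect the resource-heavy tile tests.
        config.addinivalue_line(
            "markers",
            "intense: resource-heavy tests excluded from the default unit-test run",
        )

With the marker registered, `pytest -m "not intense" tests/unit` keeps the default coverage run lightweight, and the tile-merge tests marked in this file only execute in the dedicated intense environment.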
@@ -178,6 +179,7 @@ def det_dummy_forward(self, x: DetBatchDataEntity) -> DetBatchPredEntity: return pred_entity + @pytest.mark.intense() def inst_seg_dummy_forward(self, x: InstanceSegBatchDataEntity) -> InstanceSegBatchPredEntity: """Dummy instance segmantation forward function for testing. @@ -240,6 +242,7 @@ def inst_seg_dummy_forward(self, x: InstanceSegBatchDataEntity) -> InstanceSegBa return pred_entity + @pytest.mark.intense() @pytest.mark.parametrize( "task", [OTXTaskType.DETECTION, OTXTaskType.INSTANCE_SEGMENTATION, OTXTaskType.SEMANTIC_SEGMENTATION], @@ -381,6 +384,7 @@ def test_tile_sampler(self, fxt_data_config): assert sampled_count == count, "Sampled count should be equal to the count of the dataloader batch size" + @pytest.mark.intense() def test_train_dataloader(self, fxt_data_config) -> None: for task, data_config in fxt_data_config.items(): # Enable tile adapter @@ -400,6 +404,7 @@ def test_train_dataloader(self, fxt_data_config) -> None: else: pytest.skip("Task not supported") + @pytest.mark.intense() def test_val_dataloader(self, fxt_data_config) -> None: for task, data_config in fxt_data_config.items(): # Enable tile adapter @@ -419,6 +424,7 @@ def test_val_dataloader(self, fxt_data_config) -> None: else: pytest.skip("Task not supported") + @pytest.mark.intense() def test_det_tile_merge(self, fxt_data_config): data_config = fxt_data_config[OTXTaskType.DETECTION] model = ATSS( @@ -427,6 +433,8 @@ def test_det_tile_merge(self, fxt_data_config): ) # updated from OTXDetectionModel to avoid NotImplementedError in _build_model # Enable tile adapter data_config["tile_config"] = TileConfig(enable_tiler=True) + data_config["mem_cache_size"] = "0" + data_config["val_subset"].batch_size = 1 tile_datamodule = OTXDataModule( task=OTXTaskType.DETECTION, **data_config, @@ -439,6 +447,7 @@ def test_det_tile_merge(self, fxt_data_config): for batch in tile_datamodule.val_dataloader(): model.forward_tiles(batch) + @pytest.mark.intense() def test_explain_det_tile_merge(self, fxt_data_config): data_config = fxt_data_config[OTXTaskType.DETECTION] model = ATSS( @@ -447,6 +456,8 @@ def test_explain_det_tile_merge(self, fxt_data_config): ) # updated from OTXDetectionModel to avoid NotImplementedError in _build_model # Enable tile adapter data_config["tile_config"] = TileConfig(enable_tiler=True, enable_adaptive_tiling=False) + data_config["mem_cache_size"] = "0" + data_config["val_subset"].batch_size = 1 tile_datamodule = OTXDataModule( task=OTXTaskType.DETECTION, **data_config, @@ -461,6 +472,7 @@ def test_explain_det_tile_merge(self, fxt_data_config): assert prediction.saliency_map[0].ndim == 3 self.explain_mode = False + @pytest.mark.intense() def test_instseg_tile_merge(self, fxt_data_config): data_config = fxt_data_config[OTXTaskType.INSTANCE_SEGMENTATION] model = MaskRCNN(label_info=3, model_name="maskrcnn_efficientnet_b2b", input_size=(256, 256)) @@ -480,6 +492,7 @@ def test_instseg_tile_merge(self, fxt_data_config): for batch in tile_datamodule.val_dataloader(): model.forward_tiles(batch) + @pytest.mark.intense() def test_explain_instseg_tile_merge(self, fxt_data_config): data_config = fxt_data_config[OTXTaskType.INSTANCE_SEGMENTATION] model = MaskRCNN(label_info=3, model_name="maskrcnn_efficientnet_b2b", input_size=(256, 256)) @@ -501,11 +514,14 @@ def test_explain_instseg_tile_merge(self, fxt_data_config): assert prediction.saliency_map[0].ndim == 3 self.explain_mode = False + @pytest.mark.intense() def test_seg_tile_merge(self, fxt_data_config): data_config = 
fxt_data_config[OTXTaskType.SEMANTIC_SEGMENTATION] model = LiteHRNet(label_info=3, model_name="lite_hrnet_18") # Enable tile adapter data_config["tile_config"] = TileConfig(enable_tiler=True) + data_config["mem_cache_size"] = "0" + data_config["val_subset"].batch_size = 1 tile_datamodule = OTXDataModule( task=OTXTaskType.SEMANTIC_SEGMENTATION, **data_config, @@ -517,6 +533,7 @@ def test_seg_tile_merge(self, fxt_data_config): for batch in tile_datamodule.val_dataloader(): model.forward_tiles(batch) + @pytest.mark.intense() def test_seg_tiler(self, mocker): rng = np.random.default_rng() rnd_tile_size = rng.integers(low=100, high=500) diff --git a/tox.ini b/tox.ini index a4d8d7ac0db..e8b1431fb37 100644 --- a/tox.ini +++ b/tox.ini @@ -48,13 +48,19 @@ deps = .[base,dev] commands = ; Run Unit-Test with coverage report. - pytest tests/unit \ + pytest -m "not intense" tests/unit \ --cov=otx \ --cov-report=xml:{toxworkdir}/coverage_{envname}.xml \ --cov-report=term-missing \ --cov-fail-under=0 \ {posargs} +[testenv:intense-unit-test-{py310, py311}] +deps = + .[base,dev] +commands = + pytest -m "intense" tests/unit {posargs} + [testenv:integration-test-{all, action, classification, multi_cls_classification, multi_label_classification, hlabel_classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting_all, visual_prompting, zero_shot_visual_prompting, anomaly, anomaly_classification, anomaly_detection, anomaly_segmentation, object_detection_3d}] setenv = From e4b234fe64dca001db57dd39c3f19a839473a238 Mon Sep 17 00:00:00 2001 From: Alexander Barabanov <97449232+AlexanderBarabanov@users.noreply.github.com> Date: Fri, 17 Jan 2025 14:25:30 +0000 Subject: [PATCH 3/9] Bump download-artifact (#4179) bump download-artifact --- .github/workflows/perf_benchmark.yaml | 2 +- .github/workflows/publish.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/perf_benchmark.yaml b/.github/workflows/perf_benchmark.yaml index f5297c1a640..4de6b98e289 100644 --- a/.github/workflows/perf_benchmark.yaml +++ b/.github/workflows/perf_benchmark.yaml @@ -172,7 +172,7 @@ jobs: python -m pip install --require-hashes --no-deps -r /tmp/requirements.txt rm /tmp/requirements.txt - name: Download benchmark results - uses: actions/download-artifact@87c55149d96e628cc2ef7e6fc2aab372015aec85 # v4.1.3 + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: path: tests/perf/history/latest - name: Summarize benchamrk results diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 5adc23c3714..11dc49ebdb2 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -49,7 +49,7 @@ jobs: id-token: write steps: - name: Download artifacts - uses: actions/download-artifact@87c55149d96e628cc2ef7e6fc2aab372015aec85 # v4.1.3 + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: path: dist pattern: artifact-* From c3ce3618869f6fdaf78c34055e6fa7d9ff4c6f82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 09:32:30 +0100 Subject: [PATCH 4/9] Bump diffusers from 0.32.1 to 0.32.2 in the pip-others group (#4178) Bumps the pip-others group with 1 update: [diffusers](https://github.com/huggingface/diffusers). 
Updates `diffusers` from 0.32.1 to 0.32.2 - [Release notes](https://github.com/huggingface/diffusers/releases) - [Commits](https://github.com/huggingface/diffusers/compare/v0.32.1...v0.32.2) --- updated-dependencies: - dependency-name: diffusers dependency-type: direct:production update-type: version-update:semver-patch dependency-group: pip-others ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Vladislav Sovrasov --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fa4b942ff1b..48ce9f40767 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,7 +90,7 @@ base = [ transformers = [ "transformers==4.48.0", - "diffusers==0.32.1", + "diffusers==0.32.2", "torchmetrics[image]>=0.7.0" ] From 7ca826ad859dea910a8822ae15e2292e48257be6 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Mon, 20 Jan 2025 10:24:42 +0100 Subject: [PATCH 5/9] Update keypoint metric name in benchmark (#4180) --- tests/perf/test_keypoint_detection.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/perf/test_keypoint_detection.py b/tests/perf/test_keypoint_detection.py index 1ff150a03d6..8786f530445 100644 --- a/tests/perf/test_keypoint_detection.py +++ b/tests/perf/test_keypoint_detection.py @@ -47,10 +47,10 @@ class TestPerfKeypointDetection(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/PCK", summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), @@ -120,10 +120,10 @@ class TestPerfKeypointDetectionSingleObj(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/PCK", summary="max", compare=">", margin=0.1), 
Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), From 4416ac42019e6a7e596e4d02a2289b55fd8f0316 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 Jan 2025 16:40:46 +0100 Subject: [PATCH 6/9] Bump the pip-others group across 1 directory with 4 updates (#4185) Bumps the pip-others group with 4 updates in the / directory: [numba](https://github.com/numba/numba), [tox](https://github.com/tox-dev/tox), [pre-commit](https://github.com/pre-commit/pre-commit) and [transformers](https://github.com/huggingface/transformers). Updates `numba` from 0.60.0 to 0.61.0 - [Release notes](https://github.com/numba/numba/releases) - [Commits](https://github.com/numba/numba/compare/0.60.0...0.61.0) Updates `tox` from 4.23.2 to 4.24.1 - [Release notes](https://github.com/tox-dev/tox/releases) - [Changelog](https://github.com/tox-dev/tox/blob/main/docs/changelog.rst) - [Commits](https://github.com/tox-dev/tox/compare/4.23.2...4.24.1) Updates `pre-commit` from 4.0.1 to 4.1.0 - [Release notes](https://github.com/pre-commit/pre-commit/releases) - [Changelog](https://github.com/pre-commit/pre-commit/blob/main/CHANGELOG.md) - [Commits](https://github.com/pre-commit/pre-commit/compare/v4.0.1...v4.1.0) Updates `transformers` from 4.48.0 to 4.48.1 - [Release notes](https://github.com/huggingface/transformers/releases) - [Commits](https://github.com/huggingface/transformers/compare/v4.48.0...v4.48.1) --- updated-dependencies: - dependency-name: numba dependency-type: direct:production update-type: version-update:semver-minor dependency-group: pip-others - dependency-name: tox dependency-type: direct:production update-type: version-update:semver-minor dependency-group: pip-others - dependency-name: pre-commit dependency-type: direct:production update-type: version-update:semver-minor dependency-group: pip-others - dependency-name: transformers dependency-type: direct:production update-type: version-update:semver-patch dependency-group: pip-others ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 48ce9f40767..567dfbce811 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,15 +39,15 @@ dependencies = [ "einops==0.8.0", "decord==0.6.0", "typeguard>=4.3,<4.5", - "numba==0.60.0", + "numba==0.61.0", # TODO(ashwinvaidya17): https://github.com/openvinotoolkit/anomalib/issues/2126 "setuptools<70", ] [project.optional-dependencies] dev = [ - "tox==4.23.2", - "pre-commit==4.0.1", + "tox==4.24.1", + "pre-commit==4.1.0", "pylint", "pytest", "coverage", @@ -89,7 +89,7 @@ base = [ ] transformers = [ - "transformers==4.48.0", + "transformers==4.48.1", "diffusers==0.32.2", "torchmetrics[image]>=0.7.0" ] @@ -106,7 +106,7 @@ mmlab = [ ] ci_tox = [ - "tox==4.23.2", + "tox==4.24.1", ] ci_publish = [ From 2f0b54b4c01a13cfaf442270883445f9a1c111a9 Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Thu, 23 Jan 2025 08:36:14 +0000 Subject: [PATCH 7/9] DETR XAI (#4184) * Implement explainability features in DFine and RTDETR models --- CHANGELOG.md | 2 + docs/source/guide/tutorials/base/explain.rst | 4 +- src/otx/algo/detection/d_fine.py | 56 ++++++++++++++++++ .../detectors/detection_transformer.py | 40 ++++++++++--- src/otx/algo/detection/heads/dfine_decoder.py | 31 ++++++++-- .../algo/detection/heads/rtdetr_decoder.py | 35 ++++++++--- src/otx/algo/detection/rtdetr.py | 58 ++++++++++++++++++- tests/integration/api/test_xai.py | 18 +----- tests/integration/cli/test_cli.py | 13 +---- 9 files changed, 205 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a5dbbb26b6..e0578a1549b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ All notable changes to this project will be documented in this file. () - Add D-Fine Detection Algorithm () +- Add DETR XAI Explain Mode + () ### Enhancements diff --git a/docs/source/guide/tutorials/base/explain.rst b/docs/source/guide/tutorials/base/explain.rst index bf2af135783..cb195b9a914 100644 --- a/docs/source/guide/tutorials/base/explain.rst +++ b/docs/source/guide/tutorials/base/explain.rst @@ -32,6 +32,7 @@ which are heatmaps with red-colored areas indicating focus. Here's an example ho (otx) ...$ otx explain --work_dir otx-workspace \ --dump True # Wherether to save saliency map images or not + --explain_config.postprocess True # Resizes and applies colormap to the saliency map .. tab-item:: CLI (with config) @@ -41,6 +42,7 @@ which are heatmaps with red-colored areas indicating focus. Here's an example ho --data_root data/wgisd \ --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_033.ckpt \ --dump True # Wherether to save saliency map images or not + --explain_config.postprocess True # Resizes and applies colormap to the saliency map .. tab-item:: API @@ -49,7 +51,7 @@ which are heatmaps with red-colored areas indicating focus. 
Here's an example ho engine.explain( checkpoint="", datamodule=OTXDataModule(...), # The data module to use for predictions - explain_config=ExplainConfig(postprocess=True), + explain_config=ExplainConfig(postprocess=True), # Resizes and applies colormap to the saliency map dump=True # Wherether to save saliency map images or not ) diff --git a/src/otx/algo/detection/d_fine.py b/src/otx/algo/detection/d_fine.py index 5e16aa9c3c7..717ea9d6b23 100644 --- a/src/otx/algo/detection/d_fine.py +++ b/src/otx/algo/detection/d_fine.py @@ -157,6 +157,9 @@ def _customize_inputs( ) targets.append({"boxes": scaled_bboxes, "labels": ll}) + if self.explain_mode: + return {"entity": entity} + return { "images": entity.images, "targets": targets, @@ -185,6 +188,33 @@ def _customize_outputs( original_sizes = [img_info.ori_shape for img_info in inputs.imgs_info] scores, bboxes, labels = self.model.postprocess(outputs, original_sizes) + if self.explain_mode: + if not isinstance(outputs, dict): + msg = f"Model output should be a dict, but got {type(outputs)}." + raise ValueError(msg) + + if "feature_vector" not in outputs: + msg = "No feature vector in the model output." + raise ValueError(msg) + + if "saliency_map" not in outputs: + msg = "No saliency maps in the model output." + raise ValueError(msg) + + saliency_map = outputs["saliency_map"].detach().cpu().numpy() + feature_vector = outputs["feature_vector"].detach().cpu().numpy() + + return DetBatchPredEntity( + batch_size=len(outputs), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=scores, + bboxes=bboxes, + labels=labels, + feature_vector=feature_vector, + saliency_map=saliency_map, + ) + return DetBatchPredEntity( batch_size=len(outputs), images=inputs.images, @@ -306,3 +336,29 @@ def _optimization_config(self) -> dict[str, Any]: }, }, } + + @staticmethod + def _forward_explain_detection( + self, # noqa: ANN001 + entity: DetBatchDataEntity, + mode: str = "tensor", # noqa: ARG004 + ) -> dict[str, torch.Tensor]: + """Forward function for explainable detection model.""" + backbone_feats = self.encoder(self.backbone(entity.images)) + predictions = self.decoder(backbone_feats, explain_mode=True) + + raw_logits = DETR.split_and_reshape_logits( + backbone_feats, + predictions["raw_logits"], + ) + + saliency_map = self.explain_fn(raw_logits) + feature_vector = self.feature_vector_fn(backbone_feats) + predictions.update( + { + "feature_vector": feature_vector, + "saliency_map": saliency_map, + }, + ) + + return predictions diff --git a/src/otx/algo/detection/detectors/detection_transformer.py b/src/otx/algo/detection/detectors/detection_transformer.py index d6798f1d426..f3cda5b7417 100644 --- a/src/otx/algo/detection/detectors/detection_transformer.py +++ b/src/otx/algo/detection/detectors/detection_transformer.py @@ -1,11 +1,10 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """Base DETR model implementations.""" from __future__ import annotations -import warnings from typing import Any import numpy as np @@ -96,22 +95,47 @@ def export( explain_mode: bool = False, ) -> dict[str, Any] | tuple[list[Any], list[Any], list[Any]]: """Exports the model.""" + backbone_feats = self.encoder(self.backbone(batch_inputs)) + predictions = self.decoder(backbone_feats, explain_mode=True) results = self.postprocess( - self._forward_features(batch_inputs), + predictions, [meta["img_shape"] for meta in batch_img_metas], deploy_mode=True, ) - if explain_mode: - # TODO(Eugene): Implement 
explain mode for DETR model. - warnings.warn("Explain mode is not supported for DETR model. Return dummy values.", stacklevel=2) + raw_logits = self.split_and_reshape_logits(backbone_feats, predictions["raw_logits"]) + feature_vector = self.feature_vector_fn(backbone_feats) + saliency_map = self.explain_fn(raw_logits) xai_output = { - "feature_vector": torch.zeros(1, 1), - "saliency_map": torch.zeros(1), + "feature_vector": feature_vector, + "saliency_map": saliency_map, } results.update(xai_output) # type: ignore[union-attr] return results + @staticmethod + def split_and_reshape_logits( + backbone_feats: tuple[Tensor, ...], + raw_logits: Tensor, + ) -> tuple[Tensor, ...]: + """Splits and reshapes raw logits for explain mode. + + Args: + backbone_feats (tuple[Tensor,...]): Tuple of backbone features. + raw_logits (Tensor): Raw logits. + + Returns: + tuple[Tensor,...]: The reshaped logits. + """ + splits = [f.shape[-2] * f.shape[-1] for f in backbone_feats] + # Permute and split logits in one line + raw_logits = torch.split(raw_logits.permute(0, 2, 1), splits, dim=-1) + + # Reshape each split in a list comprehension + return tuple( + logits.reshape(f.shape[0], -1, f.shape[-2], f.shape[-1]) for logits, f in zip(raw_logits, backbone_feats) + ) + def postprocess( self, outputs: dict[str, Tensor], diff --git a/src/otx/algo/detection/heads/dfine_decoder.py b/src/otx/algo/detection/heads/dfine_decoder.py index d28e0cf3864..e2d8f9dd663 100644 --- a/src/otx/algo/detection/heads/dfine_decoder.py +++ b/src/otx/algo/detection/heads/dfine_decoder.py @@ -723,7 +723,7 @@ def _get_decoder_input( enc_topk_bbox_unact = torch.concat([denoising_bbox_unact, enc_topk_bbox_unact], dim=1) content = torch.concat([denoising_logits, content], dim=1) - return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list + return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list, enc_outputs_logits def _select_topk( self, @@ -762,8 +762,22 @@ def _select_topk( return topk_memory, topk_logits, topk_anchors - def forward(self, feats: Tensor, targets: list[dict[str, Tensor]] | None = None) -> dict[str, Tensor]: - """Forward pass of the DFine Transformer module.""" + def forward( + self, + feats: Tensor, + targets: list[dict[str, Tensor]] | None = None, + explain_mode: bool = False, + ) -> dict[str, Tensor]: + """Forward function of the D-FINE Decoder Transformer Module. + + Args: + feats (Tensor): Feature maps. + targets (list[dict[str, Tensor]] | None, optional): target annotations. Defaults to None. + explain_mode (bool, optional): Whether to return raw logits for explanation. Defaults to False. + + Returns: + dict[str, Tensor]: Output dictionary containing predicted logits, losses and boxes. 
+ """ # input projection and embedding memory, spatial_shapes = self._get_encoder_input(feats) @@ -781,7 +795,13 @@ def forward(self, feats: Tensor, targets: list[dict[str, Tensor]] | None = None) else: denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - init_ref_contents, init_ref_points_unact, enc_topk_bboxes_list, enc_topk_logits_list = self._get_decoder_input( + ( + init_ref_contents, + init_ref_points_unact, + enc_topk_bboxes_list, + enc_topk_logits_list, + raw_logits, + ) = self._get_decoder_input( memory, spatial_shapes, denoising_logits, @@ -858,6 +878,9 @@ def forward(self, feats: Tensor, targets: list[dict[str, Tensor]] | None = None) "pred_boxes": out_bboxes[-1], } + if explain_mode: + out["raw_logits"] = raw_logits + return out @torch.jit.unused diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py index dd5cf2f1991..bf140675ef7 100644 --- a/src/otx/algo/detection/heads/rtdetr_decoder.py +++ b/src/otx/algo/detection/heads/rtdetr_decoder.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """RTDETR decoder, modified from https://github.com/lyuwenyu/RT-DETR.""" @@ -546,10 +546,10 @@ def _get_decoder_input( output_memory = self.enc_output(memory) - enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_logits = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors - _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1) + _, topk_ind = torch.topk(enc_outputs_logits.max(-1).values, self.num_queries, dim=1) reference_points_unact = enc_outputs_coord_unact.gather( dim=1, @@ -560,9 +560,9 @@ def _get_decoder_input( if denoising_bbox_unact is not None: reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1) - enc_topk_logits = enc_outputs_class.gather( + enc_topk_logits = enc_outputs_logits.gather( dim=1, - index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1]), + index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_logits.shape[-1]), ) # extract region features @@ -575,10 +575,24 @@ def _get_decoder_input( if denoising_class is not None: target = torch.concat([denoising_class, target], 1) - return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits + return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits, enc_outputs_logits - def forward(self, feats: torch.Tensor, targets: list[dict[str, torch.Tensor]] | None = None) -> torch.Tensor: - """Forward pass of the RTDETRTransformer module.""" + def forward( + self, + feats: torch.Tensor, + targets: list[dict[str, torch.Tensor]] | None = None, + explain_mode: bool = False, + ) -> dict[str, torch.Tensor]: + """Forward function of RTDETRTransformer. + + Args: + feats (Tensor): Input features. + targets (List[Dict[str, Tensor]]): List of target dictionaries. + explain_mode (bool): Whether to return raw logits for explanation. + + Returns: + dict[str, Tensor]: Output dictionary containing predicted logits, losses and boxes. 
+ """ # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) @@ -596,7 +610,7 @@ def forward(self, feats: torch.Tensor, targets: list[dict[str, torch.Tensor]] | else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = self._get_decoder_input( + target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits, raw_logits = self._get_decoder_input( memory, spatial_shapes, denoising_class, @@ -630,6 +644,9 @@ def forward(self, feats: torch.Tensor, targets: list[dict[str, torch.Tensor]] | out["dn_aux_outputs"] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) out["dn_meta"] = dn_meta + if explain_mode: + out["raw_logits"] = raw_logits + return out @torch.jit.unused diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 87784dadd7a..fcbf6330c2e 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """RTDetr model implementations.""" @@ -128,6 +128,9 @@ def _customize_inputs( ) targets.append({"boxes": scaled_bboxes, "labels": ll}) + if self.explain_mode: + return {"entity": entity} + return { "images": entity.images, "targets": targets, @@ -156,6 +159,33 @@ def _customize_outputs( original_sizes = [img_info.ori_shape for img_info in inputs.imgs_info] scores, bboxes, labels = self.model.postprocess(outputs, original_sizes) + if self.explain_mode: + if not isinstance(outputs, dict): + msg = f"Model output should be a dict, but got {type(outputs)}." + raise ValueError(msg) + + if "feature_vector" not in outputs: + msg = "No feature vector in the model output." + raise ValueError(msg) + + if "saliency_map" not in outputs: + msg = "No saliency maps in the model output." 
+ raise ValueError(msg) + + saliency_map = outputs["saliency_map"].detach().cpu().numpy() + feature_vector = outputs["feature_vector"].detach().cpu().numpy() + + return DetBatchPredEntity( + batch_size=len(outputs), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=scores, + bboxes=bboxes, + labels=labels, + feature_vector=feature_vector, + saliency_map=saliency_map, + ) + return DetBatchPredEntity( batch_size=len(outputs), images=inputs.images, @@ -271,3 +301,29 @@ def _exporter(self) -> OTXModelExporter: def _optimization_config(self) -> dict[str, Any]: """PTQ config for RT-DETR.""" return {"model_type": "transformer"} + + @staticmethod + def _forward_explain_detection( + self, # noqa: ANN001 + entity: DetBatchDataEntity, + mode: str = "tensor", # noqa: ARG004 + ) -> dict[str, torch.Tensor]: + """Forward function for explainable detection model.""" + backbone_feats = self.encoder(self.backbone(entity.images)) + predictions = self.decoder(backbone_feats, explain_mode=True) + + raw_logits = DETR.split_and_reshape_logits( + backbone_feats, + predictions["raw_logits"], + ) + + saliency_map = self.explain_fn(raw_logits) + feature_vector = self.feature_vector_fn(backbone_feats) + predictions.update( + { + "feature_vector": feature_vector, + "saliency_map": saliency_map, + }, + ) + + return predictions diff --git a/tests/integration/api/test_xai.py b/tests/integration/api/test_xai.py index 5408cd5049e..d82723470ec 100644 --- a/tests/integration/api/test_xai.py +++ b/tests/integration/api/test_xai.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from pathlib import Path @@ -51,11 +51,6 @@ def test_forward_explain( # TODO(Eugene): maskdino not support yet. pytest.skip(f"There's issue with inst-seg: {model_name}. Skip for now.") - if "dfine" in model_name: - # TODO(Eugene): dfine not support yet. - # https://jira.devtools.intel.com/browse/CVS-160781 - pytest.skip(f"There's issue with dfine: {model_name}. Skip for now.") - if "dino" in model_name: pytest.skip("DINO is not supported.") @@ -63,9 +58,6 @@ def test_forward_explain( # TODO (sungchul): enable xai for rtmdet_tiny (CVS-142651) pytest.skip("rtmdet_tiny on detection is not supported yet.") - if "rtdetr" in recipe: - pytest.skip("rtdetr on detection is not supported yet.") - if "yolov9" in recipe: pytest.skip("yolov9 on detection is not supported yet.") @@ -123,11 +115,6 @@ def test_predict_with_explain( # TODO(Eugene): maskdino not support yet. pytest.skip(f"There's issue with inst-seg: {model_name}. Skip for now.") - if "dfine" in model_name: - # TODO(Eugene): dfine not support yet. - # https://jira.devtools.intel.com/browse/CVS-160781 - pytest.skip(f"There's issue with dfine: {model_name}. 
Skip for now.") - if "rtmdet_tiny" in recipe: # TODO (sungchul): enable xai for rtmdet_tiny (CVS-142651) pytest.skip("rtmdet_tiny on detection is not supported yet.") @@ -136,9 +123,6 @@ def test_predict_with_explain( # TODO (Galina): required to update model-api to 2.1 pytest.skip("yolox_tiny_tile on detection requires model-api update") - if "rtdetr" in recipe: - pytest.skip("rtdetr on detection is not supported yet.") - if "yolov9" in recipe: pytest.skip("yolov9 on detection is not supported yet.") diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index 0de2a490929..3c11993ddab 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -252,12 +252,6 @@ def test_otx_e2e( if "dino" in model_name: return # DINO is not supported. - if "dfine" in model_name: - return # DFine is not supported. - - if "rtdetr" in model_name: - return # RT-DETR currently is not supported. - if "yolov9" in model_name: return # RT-DETR currently is not supported. @@ -334,13 +328,8 @@ def test_otx_explain_e2e( if "dino" in model_name: pytest.skip("DINO is not supported.") - if "dfine" in model_name: - pytest.skip("DFine is not supported.") - if "maskrcnn_r50_tv" in model_name: pytest.skip("MaskRCNN R50 Torchvision model doesn't support explain.") - elif "rtdetr" in recipe: - pytest.skip("rtdetr model is not supported yet with explain.") elif "keypoint" in recipe: pytest.skip("keypoint detection models don't support explain.") elif "yolov9" in recipe: From 24b0b9f2a9837e3d80dd58edfcf833f6f4cd7f52 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 Jan 2025 15:22:10 +0100 Subject: [PATCH 8/9] Bump mlflow from 2.19.0 to 2.20.0 in the pip-others group (#4186) Bumps the pip-others group with 1 update: [mlflow](https://github.com/mlflow/mlflow). Updates `mlflow` from 2.19.0 to 2.20.0 - [Release notes](https://github.com/mlflow/mlflow/releases) - [Changelog](https://github.com/mlflow/mlflow/blob/master/CHANGELOG.md) - [Commits](https://github.com/mlflow/mlflow/compare/v2.19.0...v2.20.0) --- updated-dependencies: - dependency-name: mlflow dependency-type: direct:production update-type: version-update:semver-minor dependency-group: pip-others ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 567dfbce811..8c2438730f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,7 +120,7 @@ ci_benchmark = [ "ipython==8.31.0", "ipykernel==6.29.5", "openpyxl==3.1.5", - "mlflow==2.19.0", + "mlflow==2.20.0", "py-cpuinfo==9.0.0", ] From 4c99b6f37c5bc6569038e6b5681c511ebb95095b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 Jan 2025 15:53:38 +0100 Subject: [PATCH 9/9] Bump nbconvert from 7.16.5 to 7.16.6 in the pip-others group (#4190) Bumps the pip-others group with 1 update: [nbconvert](https://github.com/jupyter/nbconvert). 
Updates `nbconvert` from 7.16.5 to 7.16.6 - [Release notes](https://github.com/jupyter/nbconvert/releases) - [Changelog](https://github.com/jupyter/nbconvert/blob/main/CHANGELOG.md) - [Commits](https://github.com/jupyter/nbconvert/compare/v7.16.5...v7.16.6) --- updated-dependencies: - dependency-name: nbconvert dependency-type: direct:production update-type: version-update:semver-patch dependency-group: pip-others ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8c2438730f1..3fa45493763 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,7 +116,7 @@ ci_publish = [ ci_benchmark = [ "pandas<2.3", # To avoid conflict with nncf==2.9.0 "matplotlib==3.10.0", - "nbconvert==7.16.5", + "nbconvert==7.16.6", "ipython==8.31.0", "ipykernel==6.29.5", "openpyxl==3.1.5",