From a6d5795d10e8feb558939629154159bd358c251a Mon Sep 17 00:00:00 2001 From: Alexander Barabanov <97449232+AlexanderBarabanov@users.noreply.github.com> Date: Thu, 16 Jan 2025 12:43:09 +0000 Subject: [PATCH 1/9] Security scan pipeline update (#4177) * codeql settings update * codeql fix * update trivy * update bandit * trivy fix * trivy fix * trivy fix * json output * trivy fix * trivy spdx * codeql added * bandit update * remove bandit B320 * remove bandit B410 * remove workflow_dispatch * revert trivy yaml * fix format --- .ci/ipas_default.config | 6 +- .github/workflows/code_scan.yaml | 134 +++++++++++++++++++++++++++---- .github/workflows/codeql.yaml | 37 +++++---- tox.ini | 2 +- 4 files changed, 143 insertions(+), 36 deletions(-) diff --git a/.ci/ipas_default.config b/.ci/ipas_default.config index 4bc8b481e39..95dd511397c 100644 --- a/.ci/ipas_default.config +++ b/.ci/ipas_default.config @@ -40,7 +40,7 @@ # B317 : xml_bad_sax # B318 : xml_bad_minidom # B319 : xml_bad_pulldom -# B320 : xml_bad_etree +# B320 : xml_bad_etree - removed https://github.com/PyCQA/bandit/commit/e4da0b351f89a82b5de8dd791cbdd963476b5a11 # B321 : ftplib # B323 : unverified_context # B324 : hashlib_new_insecure_functions @@ -53,7 +53,7 @@ # B407 : import_xml_expat # B408 : import_xml_minidom # B409 : import_xml_pulldom -# B410 : import_lxml +# B410 : import_lxml - removed https://github.com/PyCQA/bandit/commit/e4da0b351f89a82b5de8dd791cbdd963476b5a11 # B411 : import_xmlrpclib # B412 : import_httpoxy # B413 : import_pycrypto @@ -83,7 +83,7 @@ # IPAS Required Checkers. Do not disable these # Additional checkers may be added if desired tests: - [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413'] + [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B411', 'B412', 'B413'] # (optional) list skipped test IDs here, eg '[B101, B406]': # The following checkers are not required but be added to tests list if desired diff --git a/.github/workflows/code_scan.yaml b/.github/workflows/code_scan.yaml index ad66b1d55a2..85d0d8abb1d 100644 --- a/.github/workflows/code_scan.yaml +++ b/.github/workflows/code_scan.yaml @@ -10,12 +10,11 @@ on: # every UTC 6PM from Mon to Fri - cron: "0 18 * * 1-5" -# Declare default permissions as read only. 
-permissions: read-all +permissions: {} jobs: - Trivy-scan: - runs-on: ubuntu-latest + Trivy: + runs-on: ubuntu-22.04 steps: - name: Checkout code uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -27,27 +26,52 @@ jobs: run: python -m pip install --require-hashes --no-deps -r .ci/requirements.txt - name: Freeze dependencies run: pip-compile --extra=docs,base,mmlab,anomaly -o requirements.txt pyproject.toml - - name: Trivy Scanning (spdx.json) - uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # 0.29.0 + + - name: Run Trivy Scan (vuln) + uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # v0.29.0 with: - trivy-config: ".ci/trivy-json.yaml" - scan-type: "fs" + scan-type: fs + scan-ref: requirements.txt + scanners: vuln + output: trivy-results-vuln.txt + + - name: Run Trivy Scan (dockerfile and secrets) + uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # v0.29.0 + with: + scan-type: fs scan-ref: . - - name: Trivy Scanning + scanners: misconfig,secret + output: trivy-results-misconfig.txt + skip-setup-trivy: true + + - name: Trivy Scanning (spdx) uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # 0.29.0 with: - trivy-config: ".ci/trivy.yaml" - scan-type: "fs" + scan-type: fs scan-ref: . + format: spdx-json + output: trivy-results-spdx.json + skip-setup-trivy: true + - name: Upload Trivy results artifact uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: trivy-results - path: "${{ github.workspace }}/trivy-results.*" + path: "${{ github.workspace }}/trivy-results-*" + retention-days: 7 # Use always() to always run this step to publish scan results when there are test failures if: ${{ always() }} + + - name: Upload deps list + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + if: always() + with: + name: python-deps-list + path: "${{ github.workspace }}/requirements.txt" + retention-days: 7 + Bandit: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -66,7 +90,87 @@ jobs: - name: Upload Bandit artifact uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: bandit-report - path: .tox/bandit-report.txt + name: bandit-results + path: .tox/bandit-results.txt + retention-days: 7 # Use always() to always run this step to publish scan results when there are test failures if: ${{ always() }} + + CodeQL: + name: Analyze (${{ matrix.language }}) + runs-on: ubuntu-22.04 + permissions: + # required for all workflows + security-events: write + + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + - language: actions # to scan workflows + build-mode: none + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + # Initializes the CodeQL tools for scanning. 
+ - name: Initialize CodeQL + uses: github/codeql-action/init@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + with: + category: "/language:${{matrix.language}}" + + - name: Generate CodeQL Report + uses: rsdmike/github-security-report-action@a149b24539044c92786ec39af8ba38c93496495d # v3.0.4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + template: report + outputDir: codeql-${{ matrix.language }} + + - name: Rename Report + shell: bash + continue-on-error: true + run: | + cd codeql-${{ matrix.language }} + mv "report.pdf" "codeql-${{ matrix.language }}.pdf" + + - name: Upload Report + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + with: + name: codeql-${{ matrix.language }}-results + path: codeql-${{ matrix.language }}/*.pdf + retention-days: 7 + + Summarize: + needs: [Trivy, Bandit, CodeQL] + if: always() + runs-on: ubuntu-22.04 + steps: + # Create directory first + - name: Create results directory + run: mkdir -p all-results + + # Download artifacts with error handling + - name: Download all results + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + continue-on-error: true # Don't fail if some tools didn't generate results + with: + pattern: "*-results" + merge-multiple: true + path: all-results + + # Only upload if there are files + - name: Upload combined results + if: hashFiles('all-results/**/*') != '' + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + with: + name: security-scan-results + path: all-results + retention-days: 7 diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml index 27a904f9445..49e78c1ac6d 100644 --- a/.github/workflows/codeql.yaml +++ b/.github/workflows/codeql.yaml @@ -12,20 +12,11 @@ name: "CodeQL" on: - push: - branches: - - develop - - releases/** pull_request: types: - opened - reopened - synchronize - schedule: - - cron: "0 0 * * 0" - -permissions: - contents: read jobs: analyze: @@ -35,20 +26,20 @@ jobs: # - https://gh.io/supported-runners-and-hardware-resources # - https://gh.io/using-larger-runners # Consider using larger runners for possible analysis time improvements. - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 timeout-minutes: 60 permissions: # required for all workflows security-events: write - # only required for workflows in private repositories - actions: read - contents: read - strategy: fail-fast: false matrix: - language: ["python"] + include: + - language: python + build-mode: none + - language: actions # to scan workflows + build-mode: none # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ] # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both @@ -63,6 +54,7 @@ jobs: uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 with: languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. 
@@ -74,13 +66,24 @@ jobs: uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 with: category: "/language:${{matrix.language}}" + - name: Generate Security Report uses: rsdmike/github-security-report-action@a149b24539044c92786ec39af8ba38c93496495d # v3.0.4 with: template: report token: ${{ secrets.GITHUB_TOKEN }} + outputDir: codeql-${{ matrix.language }} + + - name: Rename Report + shell: bash + continue-on-error: true + run: | + cd codeql-${{ matrix.language }} + mv "report.pdf" "codeql-${{ matrix.language }}.pdf" + - name: GitHub Upload Release Artifacts uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: - name: codeql-report - path: "./report.pdf" + name: codeql-${{ matrix.language }}-results + path: codeql-${{ matrix.language }}/*.pdf + retention-days: 7 diff --git a/tox.ini b/tox.ini index 7aa0fa1ad5b..a4d8d7ac0db 100644 --- a/tox.ini +++ b/tox.ini @@ -99,7 +99,7 @@ deps = allowlist_externals = bandit commands = - - bandit -r -c .ci/ipas_default.config {toxinidir}/ -f txt -o {toxworkdir}/bandit-report.txt + - bandit -r -c .ci/ipas_default.config -f txt -o {toxworkdir}/bandit-results.txt . [testenv:fuzzing] From d663fd724f3c4ceacb186a5b8a7cb55136c6aacb Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Fri, 17 Jan 2025 13:26:08 +0000 Subject: [PATCH 2/9] OTX D-Fine Detection Algorithm Integration (#4142) * init * remove convertbox * Refactor D-FINE detector: remove unused components and update model configuration * update * update * Update * update recipes * Add d-fine-m * Fix recipes * dfine-l * Add dfine m - no aug * format changes * learnable params + disable teacher distillation * update * add recipes * update * update * update recipes * add dfine_hgnetv2_x * Update recipes * add tile DFine recipes * update recipes and tile batch size * update * update LR * DFine revert LR changes * make multi-scale optional * update tile recipes * update tiling recipes * add backbone pretrained weights * updawte * update * loss * update * Update * refactor d-fine criterion * * Fix docstring punctuation and remove unused aux_loss parameter in DFINETransformerModule * Refactor DFineCriterion * Update style changes * conv batchnorm fuse * update hybrid encoder * Refactor DFINE HybridEncoderModule to improve code clarity and remove redundant parameters * minor update * Refactor D-FINE module structure by removing obsolete detector file and reorganizing imports * Refactor import paths in D-FINE module and clean up unused code * Refactor D-FINE module by removing commented code, cleaning up imports, and updating documentation * Refactor D-FINE module by updating type hints, improving error messages, and enhancing documentation for RandomIoUCrop * Refactor D-FINE module by improving the weighting function's return structure and updating type hints in DFINECriterion * Update d-fine unit test * Refactor D-FINE module by enhancing docstrings for clarity and updating parameter names for consistency * Add D-Fine Detection Algorithm entries to CHANGELOG and object detection documentation * Fix device assignment for positional embeddings in HybridEncoderModule * Refactor D-FINE module by removing unused functions and integrating dfine_bbox2distance in DFINECriterion * Update codeowners * Add advanced parameters to optimization config in DFine model * Remove DFINE M, S, N model configuration files * disable tiling mem cache * Update codeowners * revert codeowner changes * Remove unused DFINE model configurations from unit tests * Add heavy unit 
test workflow and mark tests accordingly * Add container configuration for Heavy-Unit-Test job in pre_merge.yaml * Add additional transformations to D-Fine configuration and update test skips for unsupported models * Reduce batch size and remove heavy markers from unit tests in test_tiling.py * Revert "Add additional transformations to D-Fine configuration and update test skips for unsupported models" This reverts commit d5c66f54c0eecf3f1b038c0c347eadddb7021c1f. * Revert "Reduce batch size and remove heavy markers from unit tests in test_tiling.py" This reverts commit 563e0331a99b3792c0eca6fa28f15ffd07e394d8. * Add additional transformations to D-Fine configuration in YAML files * disable pytest heavy tag * update * Remove unused DFine-L model configurations and update unit tests * Add DFine-X model template for class-incremental object detection * Update docs/source/guide/explanation/algorithms/object_detection/object_detection.rst Co-authored-by: Samet Akcay * Update copyright years from 2024 to 2025 in multiple files * Rename heavy unit tests to intense unit tests and update related configurations * Update container image in pre_merge.yaml for Intense-Unit-Test job * update pre-merge * update ubuntu container image * update container image * Add new object detection model configuration for DFine HGNetV2 X * update image * Update pre-merge workflow to use Ubuntu 24.04 and simplify unit test coverage reporting * install sqlite * Remove sudo from apt-get command in pre-merge workflow * Remove sudo from apt-get command in pre-merge workflow * Update pre-merge workflow to install additional dependencies and correct model name in converter * Update detection configuration: increase warmup steps and patience, add min_lr, and remove unused callbacks * Remove D-Fine model recipes from object detection documentation * Skip tests for unsupported models: add check for D-Fine * Skip tests for unsupported models: add check for D-Fine * Skip tests for unsupported models: add check for DFine * Refactor DFine model: remove unused checkpoint loading and update optimizer configuration documentation; change reg_scale to float in DFINETransformer. 
--------- Co-authored-by: Samet Akcay --- .github/workflows/pre_merge.yaml | 32 + CHANGELOG.md | 2 + .../object_detection/object_detection.rst | 2 + pyproject.toml | 3 +- .../algo/common/layers/transformer_layers.py | 148 ++- src/otx/algo/detection/backbones/hgnetv2.py | 640 ++++++++++++ src/otx/algo/detection/d_fine.py | 308 ++++++ src/otx/algo/detection/heads/__init__.py | 13 +- src/otx/algo/detection/heads/dfine_decoder.py | 935 ++++++++++++++++++ src/otx/algo/detection/layers/csp_layer.py | 10 +- src/otx/algo/detection/losses/__init__.py | 4 +- src/otx/algo/detection/losses/dfine_loss.py | 501 ++++++++++ .../detection/necks/dfine_hybrid_encoder.py | 438 ++++++++ src/otx/algo/detection/utils/utils.py | 165 +++- .../core/data/transform_libs/torchvision.py | 43 +- src/otx/recipe/detection/dfine_x.yaml | 129 +++ src/otx/recipe/detection/dfine_x_tile.yaml | 125 +++ src/otx/tools/converter.py | 4 + .../detection/detection/dfine_x/template.yaml | 46 + tests/integration/api/test_xai.py | 10 + tests/integration/cli/test_cli.py | 6 + tests/unit/algo/detection/test_dfine.py | 158 +++ tests/unit/core/data/test_tiling.py | 17 + tox.ini | 8 +- 24 files changed, 3736 insertions(+), 11 deletions(-) create mode 100644 src/otx/algo/detection/backbones/hgnetv2.py create mode 100644 src/otx/algo/detection/d_fine.py create mode 100644 src/otx/algo/detection/heads/dfine_decoder.py create mode 100644 src/otx/algo/detection/losses/dfine_loss.py create mode 100644 src/otx/algo/detection/necks/dfine_hybrid_encoder.py create mode 100644 src/otx/recipe/detection/dfine_x.yaml create mode 100644 src/otx/recipe/detection/dfine_x_tile.yaml create mode 100644 src/otx/tools/templates/detection/detection/dfine_x/template.yaml create mode 100644 tests/unit/algo/detection/test_dfine.py diff --git a/.github/workflows/pre_merge.yaml b/.github/workflows/pre_merge.yaml index 201aaf089e3..07e1f67b44f 100644 --- a/.github/workflows/pre_merge.yaml +++ b/.github/workflows/pre_merge.yaml @@ -84,6 +84,38 @@ jobs: curl -Os https://uploader.codecov.io/latest/linux/codecov chmod +x codecov ./codecov -t ${{ secrets.CODECOV_TOKEN }} --sha $COMMIT_ID -U $HTTP_PROXY -f .tox/coverage_unit-test-${{ matrix.tox-env }}.xml -F ${{ matrix.tox-env }} + Intense-Unit-Test: + runs-on: [otx-gpu-a10g-1] + container: + image: "ubuntu:24.04" + needs: Code-Quality-Checks + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + include: + - python-version: "3.10" + tox-env: "py310" + - python-version: "3.11" + tox-env: "py311" + name: Intense-Unit-Test-with-Python${{ matrix.python-version }} + steps: + - name: Install dependencies + run: apt-get update && apt-get install -y libsqlite3-0 libsqlite3-dev libgl1 libglib2.0-0 + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Install Python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install tox + run: | + python -m pip install --require-hashes --no-deps -r .ci/requirements.txt + pip-compile --generate-hashes --output-file=/tmp/requirements.txt --extra=ci_tox pyproject.toml + python -m pip install --require-hashes --no-deps -r /tmp/requirements.txt + rm /tmp/requirements.txt + - name: Run unit test + run: tox -vv -e intense-unit-test-${{ matrix.tox-env }} Integration-Test: if: | github.event.pull_request.draft == false && diff --git a/CHANGELOG.md b/CHANGELOG.md index bb157675bba..1a5dbbb26b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ 
-22,6 +22,8 @@ All notable changes to this project will be documented in this file. () - Add OpenVINO inference for 3D Object Detection task () +- Add D-Fine Detection Algorithm + () ### Enhancements diff --git a/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst b/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst index 3dd3fbc0349..925e4f119f1 100644 --- a/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst +++ b/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst @@ -73,6 +73,8 @@ We support the following ready-to-use model recipes: +------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ | `Object_Detection_ResNeXt101_ATSS `_ | ResNeXt101-ATSS | 434.75 | 344.0 | +------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ +| `D-Fine X Detection ` | D-Fine X | 202.486 | 240.0 | ++------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+ Above table can be found using the following command diff --git a/pyproject.toml b/pyproject.toml index 56c71ef9ec5..fa4b942ff1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -398,6 +398,7 @@ convention = "google" markers = [ "gpu", # mark tests which require NVIDIA GPU "cpu", - "xpu", # mark tests which require Intel dGPU + "xpu", # mark tests which require Intel dGPU, + "intense", # intense unit tests which require better CI machines ] python_files = "tests/**/*.py" diff --git a/src/otx/algo/common/layers/transformer_layers.py b/src/otx/algo/common/layers/transformer_layers.py index 20ae281ecad..532f314128c 100644 --- a/src/otx/algo/common/layers/transformer_layers.py +++ b/src/otx/algo/common/layers/transformer_layers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """Implementation of common transformer layers.""" @@ -10,6 +10,7 @@ from typing import Callable import torch +import torch.nn.functional as f from otx.algo.common.utils.utils import get_clones from otx.algo.modules.transformer import deformable_attention_core_func from torch import Tensor, nn @@ -306,6 +307,151 @@ def forward( return self.output_proj(output) +class MSDeformableAttentionV2(nn.Module): + """Multi-Scale Deformable Attention Module V2. + + Note: + This is different from vanilla MSDeformableAttention where it uses + distinct number of sampling points for features at different scales. + Refer to RTDETRv2. + + Args: + embed_dim (int): The number of expected features in the input. + num_heads (int): The number of heads in the multiheadattention models. + num_levels (int): The number of levels in MSDeformableAttention. + num_points_list (list[int]): Number of distinct points for each layer. Defaults to [3, 6, 3]. 
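+
+    Example:
+        A shape-only usage sketch; the batch size, query length and feature
+        map sizes below are illustrative only::
+
+            import torch
+
+            attn = MSDeformableAttentionV2(num_levels=3, num_points_list=[3, 6, 3])
+            spatial_shapes = [[32, 32], [16, 16], [8, 8]]
+            # one tensor per level, already split per head: [bs, n_head, head_dim, H*W]
+            value = [torch.rand(2, 8, 32, h * w) for h, w in spatial_shapes]
+            query = torch.rand(2, 100, 256)
+            # (cx, cy, w, h) reference boxes, broadcast over levels
+            reference_points = torch.rand(2, 100, 1, 4)
+            output = attn(query, reference_points, value, spatial_shapes)  # [2, 100, 256]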
+ """ + + def __init__( + self, + embed_dim: int = 256, + num_heads: int = 8, + num_levels: int = 4, + num_points_list: list[int] = [3, 6, 3], # noqa: B006 + ) -> None: + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points_list = num_points_list + + num_points_scale = [1 / n for n in num_points_list for _ in range(n)] + self.register_buffer( + "num_points_scale", + torch.tensor(num_points_scale, dtype=torch.float32), + ) + + self.total_points = num_heads * sum(num_points_list) + self.head_dim = embed_dim // num_heads + + self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2) + self.attention_weights = nn.Linear(embed_dim, self.total_points) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + """Reset parameters of the model.""" + init.constant_(self.sampling_offsets.weight, 0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values # noqa: PD011 + grid_init = grid_init.reshape(self.num_heads, 1, 2).tile([1, sum(self.num_points_list), 1]) + scaling = torch.concat([torch.arange(1, n + 1) for n in self.num_points_list]).reshape(1, -1, 1) + grid_init *= scaling + self.sampling_offsets.bias.data[...] = grid_init.flatten() + + # attention_weights + init.constant_(self.attention_weights.weight, 0) + init.constant_(self.attention_weights.bias, 0) + + def forward( + self, + query: Tensor, + reference_points: Tensor, + value: Tensor, + value_spatial_shapes: list[list[int]], + ) -> Tensor: + """Forward function of MSDeformableAttention. + + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, len_q = query.shape[:2] + _, n_head, c, _ = value[0].shape + num_points_list = self.num_points_list + + sampling_offsets = self.sampling_offsets(query).reshape( + bs, + len_q, + self.num_heads, + sum(self.num_points_list), + 2, + ) + + attention_weights = self.attention_weights(query).reshape( + bs, + len_q, + self.num_heads, + sum(self.num_points_list), + ) + attention_weights = f.softmax(attention_weights, dim=-1) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.tensor(value_spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.num_levels, 1, 2) + sampling_locations = ( + reference_points.reshape( + bs, + len_q, + 1, + self.num_levels, + 1, + 2, + ) + + sampling_offsets / offset_normalizer + ) + elif reference_points.shape[-1] == 4: + num_points_scale = self.num_points_scale.to(query).unsqueeze(-1) + offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * 0.5 + sampling_locations = reference_points[:, :, None, :, :2] + offset + else: + msg = (f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead.",) + raise ValueError(msg) + + # sampling_offsets [8, 480, 8, 12, 2] + sampling_grids = 2 * sampling_locations - 1 + + sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1) + sampling_locations_list = sampling_grids.split(num_points_list, dim=-2) + + sampling_value_list = [] + for 
level, (h, w) in enumerate(value_spatial_shapes): + value_l = value[level].reshape(bs * n_head, c, h, w) + sampling_grid_l = sampling_locations_list[level] + sampling_value_l = f.grid_sample( + value_l, + sampling_grid_l, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + + sampling_value_list.append(sampling_value_l) + + attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(bs * n_head, 1, len_q, sum(num_points_list)) + weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights + output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, len_q) + + return output.permute(0, 2, 1) + + class VisualEncoderLayer(nn.Module): """VisualEncoderLayer module consisting of MSDeformableAttention and feed-forward network. diff --git a/src/otx/algo/detection/backbones/hgnetv2.py b/src/otx/algo/detection/backbones/hgnetv2.py new file mode 100644 index 00000000000..65fd8408d7f --- /dev/null +++ b/src/otx/algo/detection/backbones/hgnetv2.py @@ -0,0 +1,640 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""High Performance GPU Net(HGNet) Backbone from PaddlePaddle. + +Modified from: + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + https://github.com/Peterande/D-FINE +""" + +from __future__ import annotations + +from typing import Any, ClassVar + +import torch +import torch.nn.functional as f +from torch import Tensor, nn + +from otx.algo.modules.norm import FrozenBatchNorm2d + +# Constants for initialization +kaiming_normal_ = nn.init.kaiming_normal_ +zeros_ = nn.init.zeros_ +ones_ = nn.init.ones_ + + +class LearnableAffineBlock(nn.Module): + """Learnable affine block. + + Args: + scale_value (float, optional): scale. Defaults to 1.0. + bias_value (float, optional): bias. Defaults to 0.0. + """ + + def __init__( + self, + scale_value: float = 1.0, + bias_value: float = 0.0, + ): + super().__init__() + self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True) + self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + return self.scale * x + self.bias + + +class ConvBNAct(nn.Module): + """Convolutional block with batch normalization and activation. + + TODO(Eugene): External LAB is embedded. 'Try'? switching to OTX ConvModule implementation in next PR. + + Args: + in_channels (int): In channels. + out_channels (int): Out Channels. + kernel_size (int): convolution kernel size. + stride (int, optional): stride. Defaults to 1. + groups (int, optional): number of conv groups. Defaults to 1. + use_act (bool, optional): Use ReLU activation. Defaults to True. + use_lab (bool, optional): Use learnable affine block. Defaults to False. 
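+
+    Example:
+        A minimal sketch; the channel sizes and input resolution are
+        illustrative only::
+
+            import torch
+
+            block = ConvBNAct(in_channels=3, out_channels=16, kernel_size=3, stride=2, use_lab=True)
+            out = block(torch.rand(1, 3, 64, 64))  # [1, 16, 32, 32]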
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + groups: int = 1, + use_act: bool = True, + use_lab: bool = False, + ): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False, + ) + self.bn = nn.BatchNorm2d(out_channels) + if self.use_act: + self.act = nn.ReLU() + else: + self.act = nn.Identity() + if self.use_act and self.use_lab: + self.lab = LearnableAffineBlock() + else: + self.lab = nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return self.lab(x) + + +class LightConvBNAct(nn.Module): + """Lightweight convolutional block with batch normalization and activation. + + Args: + in_chs (int): In channels. + out_chs (int): Out channels. + kernel_size (int): convolution kernel size. + use_lab (bool, optional): Use Learnable Affine Block. Defaults to False. + """ + + def __init__( + self, + in_chs: int, + out_chs: int, + kernel_size: int, + use_lab: bool = False, + ): + super().__init__() + self.conv1 = ConvBNAct( + in_chs, + out_chs, + kernel_size=1, + use_act=False, + use_lab=use_lab, + ) + self.conv2 = ConvBNAct( + out_chs, + out_chs, + kernel_size=kernel_size, + groups=out_chs, + use_act=True, + use_lab=use_lab, + ) + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + x = self.conv1(x) + return self.conv2(x) + + +class HGNetv2StemBlock(nn.Module): + """HGNetV2 stem block. + + Args: + in_chs (int): In channels. + mid_chs (int): Mid channels. + out_chs (int): Out channels. + use_lab (bool, optional): Use Learnable Affine Block. Defaults to False. + """ + + def __init__( + self, + in_chs: int, + mid_chs: int, + out_chs: int, + use_lab: bool = False, + ): + super().__init__() + self.stem1 = ConvBNAct( + in_chs, + mid_chs, + kernel_size=3, + stride=2, + use_lab=use_lab, + ) + self.stem2a = ConvBNAct( + mid_chs, + mid_chs // 2, + kernel_size=2, + stride=1, + use_lab=use_lab, + ) + self.stem2b = ConvBNAct( + mid_chs // 2, + mid_chs, + kernel_size=2, + stride=1, + use_lab=use_lab, + ) + self.stem3 = ConvBNAct( + mid_chs * 2, + mid_chs, + kernel_size=3, + stride=2, + use_lab=use_lab, + ) + self.stem4 = ConvBNAct( + mid_chs, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + x = self.stem1(x) + x = f.pad(x, (0, 1, 0, 1)) + x2 = self.stem2a(x) + x2 = f.pad(x2, (0, 1, 0, 1)) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = torch.cat([x1, x2], dim=1) + x = self.stem3(x) + return self.stem4(x) + + +class HGBlock(nn.Module): + """HGNetV2 block. + + Args: + in_chs (int): In channels. + mid_chs (int): Mid channels. + out_chs (int): Out channels. + layer_num (int): Number of convolutional layers. + kernel_size (int, optional): kernel size. Defaults to 3. + residual (bool, optional): Add residual. Defaults to False. + light_block (bool, optional): Use LightConvBNAct layer. Defaults to False. + use_lab (bool, optional): User Learnable Affine Block. Defaults to False. 
+ drop_path (float, optional): Dropout rate. Defaults to 0.0. + """ + + def __init__( + self, + in_chs: int, + mid_chs: int, + out_chs: int, + layer_num: int, + kernel_size: int = 3, + residual: bool = False, + light_block: bool = False, + use_lab: bool = False, + drop_path: float = 0.0, + ): + super().__init__() + self.residual = residual + + self.layers = nn.ModuleList() + for i in range(layer_num): + if light_block: + self.layers.append( + LightConvBNAct( + in_chs if i == 0 else mid_chs, + mid_chs, + kernel_size=kernel_size, + use_lab=use_lab, + ), + ) + else: + self.layers.append( + ConvBNAct( + in_chs if i == 0 else mid_chs, + mid_chs, + kernel_size=kernel_size, + stride=1, + use_lab=use_lab, + ), + ) + + # feature aggregation + total_chs = in_chs + layer_num * mid_chs + aggregation_squeeze_conv = ConvBNAct( + total_chs, + out_chs // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + aggregation_excitation_conv = ConvBNAct( + out_chs // 2, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + self.aggregation = nn.Sequential( + aggregation_squeeze_conv, + aggregation_excitation_conv, + ) + + self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + identity = x + output = [x] + for layer in self.layers: + x = layer(x) + output.append(x) + x = torch.cat(output, dim=1) + x = self.aggregation(x) + if self.residual: + return self.drop_path(x) + identity + return x + + +class HGStage(nn.Module): + """HGNetV2 Stage Block. + + Args: + in_chs (int): In channels. + mid_chs (int): Mid channels. + out_chs (int): Out channels. + block_num (int): Number of blocks. + layer_num (int): Number of convolutional layers. + downsample (bool, optional): Downsample. Defaults to True. + light_block (bool, optional): Use LightConvBNAct layer. Defaults to False. + kernel_size (int, optional): kernel size. Defaults to 3. + use_lab (bool, optional): User Learnable Affine Block. Defaults to False. + drop_path (float, optional): Dropout rate. Defaults to 0.0. + """ + + def __init__( + self, + in_chs: int, + mid_chs: int, + out_chs: int, + block_num: int, + layer_num: int, + downsample: bool = True, + light_block: bool = False, + kernel_size: int = 3, + use_lab: bool = False, + drop_path: float = 0.0, + ): + super().__init__() + + self.downsample = ( + ConvBNAct( + in_chs, + in_chs, + kernel_size=3, + stride=2, + groups=in_chs, + use_act=False, + use_lab=use_lab, + ) + if downsample + else nn.Identity() + ) + + blocks_list = [ + HGBlock( + out_chs if i > 0 else in_chs, + mid_chs, + out_chs, + layer_num, + residual=i > 0, + kernel_size=kernel_size, + light_block=light_block, + use_lab=use_lab, + drop_path=drop_path, + ) + for i in range(block_num) + ] + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): input tensor. + + Returns: + Tensor: output tensor. + """ + x = self.downsample(x) + return self.blocks(x) + + +class HGNetv2Module(nn.Module): + """HGNetV2 Module. + + Args: + name (str): backbone name (i.e. B0, B2, B4, B5). + use_lab (bool, optional): User Learnable Affine Block. Defaults to False. + return_idx (list[int], optional): Feature Maps. Defaults to [1, 2, 3]. + freeze_stem_only (bool, optional): Freeze Stem only. Defaults to True. + freeze_at (int, optional): Freeze at which stage block. Defaults to 0. 
+ freeze_norm (bool, optional): Freeze normalization or not. Defaults to True. + pretrained (bool, optional): Use backbone pretrained weight. Defaults to False. + """ + + arch_configs: ClassVar = { + "B0": { + "stem_channels": [3, 16, 16], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [16, 16, 64, 1, False, False, 3, 3], + "stage2": [64, 32, 256, 1, True, False, 3, 3], + "stage3": [256, 64, 512, 2, True, True, 5, 3], + "stage4": [512, 128, 1024, 1, True, True, 5, 3], + }, + "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B0_stage1.pth", + }, + "B2": { + "stem_channels": [3, 24, 32], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 96, 1, False, False, 3, 4], + "stage2": [96, 64, 384, 1, True, False, 3, 4], + "stage3": [384, 128, 768, 3, True, True, 5, 4], + "stage4": [768, 256, 1536, 1, True, True, 5, 4], + }, + "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B2_stage1.pth", + }, + "B4": { + "stem_channels": [3, 32, 48], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + }, + "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B4_stage1.pth", + }, + "B5": { + "stem_channels": [3, 32, 64], + "stage_config": { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + }, + "url": "https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B5_stage1.pth", + }, + } + + def __init__( + self, + name: str, + use_lab: bool = False, + return_idx: tuple = (1, 2, 3), + freeze_stem_only: bool = True, + freeze_at: int = 0, + freeze_norm: bool = True, + pretrained: bool = False, + ) -> None: + super().__init__() + self.use_lab = use_lab + self.return_idx = return_idx + + stem_channels = self.arch_configs[name]["stem_channels"] + stage_config = self.arch_configs[name]["stage_config"] + download_url = self.arch_configs[name]["url"] + + self._out_strides = [4, 8, 16, 32] + self._out_channels = [stage_config[k][2] for k in stage_config] + + # stem + self.stem = HGNetv2StemBlock( + in_chs=stem_channels[0], + mid_chs=stem_channels[1], + out_chs=stem_channels[2], + use_lab=use_lab, + ) + + # stages + self.stages = nn.ModuleList() + for k in stage_config: + ( + in_channels, + mid_channels, + out_channels, + block_num, + downsample, + light_block, + kernel_size, + layer_num, + ) = stage_config[k] + self.stages.append( + HGStage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample, + light_block, + kernel_size, + use_lab, + ), + ) + + if freeze_at >= 0: + self._freeze_parameters(self.stem) + if not freeze_stem_only: + for i in range(min(freeze_at + 1, len(self.stages))): + self._freeze_parameters(self.stages[i]) + + if freeze_norm: + self._freeze_norm(self) + + if pretrained: + state = torch.hub.load_state_dict_from_url( + download_url, + map_location="cpu", + 
) + print(f"Loaded stage1 {name} HGNetV2 from URL.") + self.load_state_dict(state) + + def _freeze_norm(self, m: nn.Module) -> nn.Module: + """Freeze normalization layers. + + Args: + m (nn.Module): Normalization module. + + Returns: + nn.Module: Freezed normalization module. + """ + if isinstance(m, nn.BatchNorm2d): + m = FrozenBatchNorm2d(m.num_features) + else: + for name, child in m.named_children(): + _child = self._freeze_norm(child) + if _child is not child: + setattr(m, name, _child) + return m + + def _freeze_parameters(self, m: nn.Module) -> None: + """Freeze module parameters. + + Args: + m (nn.Module): Module to freeze. + """ + for p in m.parameters(): + p.requires_grad = False + + def forward(self, x: Tensor) -> list[Tensor]: + """Forward function. + + Args: + x (Tensor): Input tensor. + + Returns: + list[Tensor]: Output tensor. + """ + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + return outs + + +class HGNetv2: + """HGNetV2 backbone.""" + + backbone_cfg: ClassVar[dict[str, Any]] = { + "dfine_hgnetv2_n": { + "name": "B0", + "return_idx": [2, 3], + "freeze_at": -1, + "freeze_norm": False, + "use_lab": True, + "freeze_stem_only": True, + "pretrained": True, + }, + "dfine_hgnetv2_s": { + "name": "B0", + "return_idx": [1, 2, 3], + "freeze_at": -1, + "freeze_norm": False, + "use_lab": True, + }, + "dfine_hgnetv2_m": { + "name": "B2", + "return_idx": [1, 2, 3], + "freeze_at": -1, + "freeze_norm": False, + "use_lab": True, + }, + "dfine_hgnetv2_l": { + "name": "B4", + "return_idx": [1, 2, 3], + "freeze_at": 0, + "freeze_norm": True, + "freeze_stem_only": True, + }, + "dfine_hgnetv2_x": { + "name": "B5", + "return_idx": [1, 2, 3], + "freeze_at": 0, + "freeze_norm": True, + "freeze_stem_only": True, + }, + } + + def __new__(cls, model_name: str) -> HGNetv2Module: + """Create HGNetV2 backbone. + + Args: + model_name (str): Model name. + + Returns: + HGNetv2Module: HGNetV2 backbone. 
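+
+        Example:
+            A minimal sketch; the input resolution is illustrative, and the
+            weights stay randomly initialized because ``pretrained`` is not set
+            for this configuration key::
+
+                import torch
+
+                backbone = HGNetv2("dfine_hgnetv2_x")
+                feats = backbone(torch.rand(1, 3, 640, 640))
+                # three feature maps at strides 8, 16 and 32 (return_idx=[1, 2, 3])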
+ """ + return HGNetv2Module(**cls.backbone_cfg[model_name]) diff --git a/src/otx/algo/detection/d_fine.py b/src/otx/algo/detection/d_fine.py new file mode 100644 index 00000000000..5e16aa9c3c7 --- /dev/null +++ b/src/otx/algo/detection/d_fine.py @@ -0,0 +1,308 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""D-Fine model implementations.""" + +from __future__ import annotations + +import copy +import re +from typing import TYPE_CHECKING, Any, Literal + +import torch +from torch import Tensor, nn +from torchvision.ops import box_convert +from torchvision.tv_tensors import BoundingBoxFormat + +from otx.algo.detection.backbones.hgnetv2 import HGNetv2 +from otx.algo.detection.detectors import DETR +from otx.algo.detection.heads.dfine_decoder import DFINETransformer +from otx.algo.detection.losses.dfine_loss import DFINECriterion +from otx.algo.detection.necks.dfine_hybrid_encoder import HybridEncoder +from otx.core.config.data import TileConfig +from otx.core.data.entity.base import OTXBatchLossEntity +from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity +from otx.core.exporter.base import OTXModelExporter +from otx.core.exporter.native import OTXNativeModelExporter +from otx.core.metrics.fmeasure import MeanAveragePrecisionFMeasureCallable +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.model.detection import ExplainableOTXDetModel + +if TYPE_CHECKING: + from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + + from otx.core.metrics import MetricCallable + from otx.core.schedulers import LRSchedulerListCallable + from otx.core.types.label import LabelInfoTypes + + +PRETRAINED_ROOT: str = "https://github.com/Peterande/storage/releases/download/dfinev1.0/" + +PRETRAINED_WEIGHTS: dict[str, str] = { + "dfine_hgnetv2_n": PRETRAINED_ROOT + "dfine_n_coco.pth", + "dfine_hgnetv2_s": PRETRAINED_ROOT + "dfine_s_coco.pth", + "dfine_hgnetv2_m": PRETRAINED_ROOT + "dfine_m_coco.pth", + "dfine_hgnetv2_l": PRETRAINED_ROOT + "dfine_l_coco.pth", + "dfine_hgnetv2_x": PRETRAINED_ROOT + "dfine_x_coco.pth", +} + + +class DFine(ExplainableOTXDetModel): + """OTX Detection model class for D-Fine.""" + + input_size_multiplier = 32 + mean: tuple[float, float, float] = (0.0, 0.0, 0.0) + std: tuple[float, float, float] = (255.0, 255.0, 255.0) + + def __init__( + self, + model_name: Literal[ + "dfine_hgnetv2_n", + "dfine_hgnetv2_s", + "dfine_hgnetv2_m", + "dfine_hgnetv2_l", + "dfine_hgnetv2_x", + ], + label_info: LabelInfoTypes, + input_size: tuple[int, int] = (640, 640), + optimizer: OptimizerCallable = DefaultOptimizerCallable, + scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, + metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, + multi_scale: bool = False, + torch_compile: bool = False, + tile_config: TileConfig = TileConfig(enable_tiler=False), + ) -> None: + self.load_from: str = PRETRAINED_WEIGHTS[model_name] + self.multi_scale = multi_scale + super().__init__( + model_name=model_name, + label_info=label_info, + input_size=input_size, + optimizer=optimizer, + scheduler=scheduler, + metric=metric, + torch_compile=torch_compile, + tile_config=tile_config, + ) + + def _build_model(self, num_classes: int) -> DETR: + backbone = HGNetv2(model_name=self.model_name) + encoder = HybridEncoder(model_name=self.model_name) + decoder = DFINETransformer( + model_name=self.model_name, + num_classes=num_classes, + ) + criterion = DFINECriterion( + 
weight_dict={ + "loss_vfl": 1, + "loss_bbox": 5, + "loss_giou": 2, + "loss_fgl": 0.15, + "loss_ddf": 1.5, + }, + alpha=0.75, + gamma=2.0, + reg_max=32, + num_classes=num_classes, + ) + + if self.model_name == "dfine_hgnetv2_n": + backbone_lr = 0.0004 + elif self.model_name == "dfine_hgnetv2_s": + backbone_lr = 0.0001 + elif self.model_name == "dfine_hgnetv2_m": + backbone_lr = 0.00002 + elif self.model_name in ("dfine_hgnetv2_l", "dfine_hgnetv2_x"): + backbone_lr = 0.0000125 + else: + msg = f"Unsupported model name: {self.model_name}" + raise ValueError(msg) + + optimizer_configuration = [ + # no weight decay for norm layers in backbone + {"params": "^(?=.*backbone)(?=.*norm).*$", "weight_decay": 0.0, "lr": backbone_lr}, + # lr for the backbone, but not norm layers is 0.00001 + {"params": "^(?=.*backbone)(?!.*norm).*$", "lr": backbone_lr}, + # no weight decay for norm layers and biases in encoder and decoder layers + {"params": "^(?=.*(?:encoder|decoder))(?=.*(?:norm|bias)).*$", "weight_decay": 0.0}, + ] + + return DETR( + multi_scale=None if self.multi_scale else [], + backbone=backbone, + encoder=encoder, + decoder=decoder, + criterion=criterion, + num_classes=num_classes, + optimizer_configuration=optimizer_configuration, + ) + + def _customize_inputs( + self, + entity: DetBatchDataEntity, + pad_size_divisor: int = 32, + pad_value: int = 0, + ) -> dict[str, Any]: + targets: list[dict[str, Any]] = [] + # prepare bboxes for the model + for bb, ll in zip(entity.bboxes, entity.labels): + # convert to cxcywh if needed + if len(scaled_bboxes := bb): + converted_bboxes = ( + box_convert(bb, in_fmt="xyxy", out_fmt="cxcywh") if bb.format == BoundingBoxFormat.XYXY else bb + ) + # normalize the bboxes + scaled_bboxes = converted_bboxes / torch.tensor(bb.canvas_size[::-1]).tile(2)[None].to( + converted_bboxes.device, + ) + targets.append({"boxes": scaled_bboxes, "labels": ll}) + + return { + "images": entity.images, + "targets": targets, + } + + def _customize_outputs( + self, + outputs: list[torch.Tensor] | dict, # type: ignore[override] + inputs: DetBatchDataEntity, + ) -> DetBatchPredEntity | OTXBatchLossEntity: + if self.training: + if not isinstance(outputs, dict): + raise TypeError(outputs) + + losses = OTXBatchLossEntity() + for k, v in outputs.items(): + if isinstance(v, list): + losses[k] = sum(v) + elif isinstance(v, Tensor): + losses[k] = v + else: + msg = "Loss output should be list or torch.tensor but got {type(v)}" + raise TypeError(msg) + return losses + + original_sizes = [img_info.ori_shape for img_info in inputs.imgs_info] + scores, bboxes, labels = self.model.postprocess(outputs, original_sizes) + + return DetBatchPredEntity( + batch_size=len(outputs), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=scores, + bboxes=bboxes, + labels=labels, + ) + + def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]: + """Configure an optimizer and learning-rate schedulers. + + Set up the optimizer and schedulers from the provided inputs. + Typically, a warmup scheduler is used initially, followed by the main scheduler. + + Returns: + Two list. The former is a list that contains an optimizer + The latter is a list of lr scheduler configs which has a dictionary format. 
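+
+        Example:
+            Rough shape of the return value for an already constructed model;
+            the concrete optimizer and schedulers depend on the configured
+            callables::
+
+                optimizers, lr_scheduler_configs = model.configure_optimizers()
+                # optimizers           -> [torch.optim.Optimizer]
+                # lr_scheduler_configs -> [{"scheduler": <LRScheduler>, ...}]
+                # each config may also carry "interval" and "monitor" keys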
+ """ + param_groups = self._get_optim_params(self.model.optimizer_configuration, self.model) + optimizer = self.optimizer_callable(param_groups) + schedulers = self.scheduler_callable(optimizer) + + def ensure_list(item: Any) -> list: # noqa: ANN401 + return item if isinstance(item, list) else [item] + + lr_scheduler_configs = [] + for scheduler in ensure_list(schedulers): + lr_scheduler_config = {"scheduler": scheduler} + if hasattr(scheduler, "interval"): + lr_scheduler_config["interval"] = scheduler.interval + if hasattr(scheduler, "monitor"): + lr_scheduler_config["monitor"] = scheduler.monitor + lr_scheduler_configs.append(lr_scheduler_config) + + return [optimizer], lr_scheduler_configs + + @staticmethod + def _get_optim_params(cfg: list[dict[str, Any]] | None, model: nn.Module) -> list[dict[str, Any]]: + """Perform no bias decay and learning rate correction for the modules. + + The configuration dict should consist of regular expression pattern for the model parameters with "params" key. + Other optimizer parameters can be added as well. + + E.g.: + cfg = [{"params": "^((?!b).)*$", "lr": 0.01, "weight_decay": 0.0}, ..] + The above configuration is for the parameters that do not contain "b". + + ^(?=.*a)(?=.*b).*$ means including a and b + ^((?!b.)*a((?!b).)*$ means including a but not b + ^((?!b|c).)*a((?!b|c).)*$ means including a but not (b | c) + """ + if cfg is None: + return model.parameters() + + cfg = copy.deepcopy(cfg) + + param_groups = [] + visited = [] + for pg in cfg: + if "params" not in pg: + msg = f"The 'params' key should be included in the configuration, but got {pg.keys()}" + raise ValueError(msg) + pattern = pg["params"] + params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0} + pg["params"] = params.values() + param_groups.append(pg) + visited.extend(list(params.keys())) + + names = [k for k, v in model.named_parameters() if v.requires_grad] + + if len(visited) < len(names): + unseen = set(names) - set(visited) + params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen} + param_groups.append({"params": params.values()}) + visited.extend(list(params.keys())) + + return param_groups + + @property + def _exporter(self) -> OTXModelExporter: + """Creates OTXModelExporter object that can export the model.""" + if self.input_size is None: + msg = f"Input size attribute is not set for {self.__class__}" + raise ValueError(msg) + + return OTXNativeModelExporter( + task_level_export_parameters=self._export_parameters, + input_size=(1, 3, *self.input_size), + mean=self.mean, + std=self.std, + resize_mode="standard", + swap_rgb=False, + via_onnx=False, + onnx_export_configuration={ + "input_names": ["images"], + "output_names": ["bboxes", "labels", "scores"], + "dynamic_axes": { + "images": {0: "batch"}, + "boxes": {0: "batch", 1: "num_dets"}, + "labels": {0: "batch", 1: "num_dets"}, + "scores": {0: "batch", 1: "num_dets"}, + }, + "autograd_inlining": False, + "opset_version": 16, + }, + output_names=["bboxes", "labels", "scores"], + ) + + @property + def _optimization_config(self) -> dict[str, Any]: + """PTQ config for D-FINE.""" + return { + "model_type": "transformer", + "advanced_parameters": { + "activations_range_estimator_params": { + "min": {"statistics_type": "QUANTILE", "aggregator_type": "MIN", "quantile_outlier_prob": 1e-4}, + "max": {"statistics_type": "QUANTILE", "aggregator_type": "MAX", "quantile_outlier_prob": 1e-4}, + }, + }, + } diff --git 
a/src/otx/algo/detection/heads/__init__.py b/src/otx/algo/detection/heads/__init__.py index fd20bfe4808..c38d62b64f8 100644 --- a/src/otx/algo/detection/heads/__init__.py +++ b/src/otx/algo/detection/heads/__init__.py @@ -1,12 +1,21 @@ -# Copyright (C) 2023-2024 Intel Corporation +# Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 """Custom head implementations for detection task.""" from .atss_head import ATSSHead +from .dfine_decoder import DFINETransformer from .rtdetr_decoder import RTDETRTransformer from .rtmdet_head import RTMDetSepBNHead from .ssd_head import SSDHead from .yolo_head import YOLOHead from .yolox_head import YOLOXHead -__all__ = ["ATSSHead", "RTDETRTransformer", "RTMDetSepBNHead", "SSDHead", "YOLOHead", "YOLOXHead"] +__all__ = [ + "ATSSHead", + "DFINETransformer", + "RTDETRTransformer", + "RTMDetSepBNHead", + "SSDHead", + "YOLOHead", + "YOLOXHead", +] diff --git a/src/otx/algo/detection/heads/dfine_decoder.py b/src/otx/algo/detection/heads/dfine_decoder.py new file mode 100644 index 00000000000..d28e0cf3864 --- /dev/null +++ b/src/otx/algo/detection/heads/dfine_decoder.py @@ -0,0 +1,935 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""D-FINE Decoder. Modified from D-FINE (https://github.com/Peterande/D-FINE).""" + +from __future__ import annotations + +import copy +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, ClassVar + +import torch +import torch.nn.functional as f +from torch import Tensor, nn +from torch.nn import init + +from otx.algo.common.layers.transformer_layers import MLP, MSDeformableAttentionV2 +from otx.algo.common.utils.utils import inverse_sigmoid +from otx.algo.detection.heads.rtdetr_decoder import get_contrastive_denoising_training_group +from otx.algo.detection.utils.utils import dfine_distance2bbox, dfine_weighting_function +from otx.algo.utils.weight_init import bias_init_with_prob + + +class TransformerDecoderLayer(nn.Module): + """Transformer Decoder Layer with MSDeformableAttentionV2. + + Args: + d_model (int): The number of expected features in the input. Defaults to 256. + n_head (int): The number of heads in the multiheadattention models. Defaults to 8. + dim_feedforward (int): The dimension of the feedforward network model. Defaults to 1024. + dropout (float): The dropout value. Defaults to 0.0. + activation (Callable[..., nn.Module] | None, optional): The activation function. Defaults to None. + n_levels (int): The number of levels in MSDeformableAttention. Defaults to 4. + num_points_list (list[int], optional): Number of distinct points for each layer. Defaults to [3, 6, 3]. 
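+
+    Example:
+        A shape-only sketch; sizes are illustrative, and ``value`` must already
+        be split per level and head (see ``TransformerDecoder.value_op``)::
+
+            import torch
+
+            layer = TransformerDecoderLayer(d_model=256, n_head=8, n_levels=3)
+            spatial_shapes = [[32, 32], [16, 16], [8, 8]]
+            value = [torch.rand(2, 8, 32, h * w) for h, w in spatial_shapes]
+            target = torch.rand(2, 100, 256)
+            reference_points = torch.rand(2, 100, 1, 4)
+            out = layer(target, reference_points, value, spatial_shapes)  # [2, 100, 256]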
+ """ + + def __init__( + self, + d_model: int = 256, + n_head: int = 8, + dim_feedforward: int = 1024, + dropout: float = 0.0, + activation: Callable[..., nn.Module] = partial(nn.ReLU, inplace=True), + n_levels: int = 4, + num_points_list: list[int] = [3, 6, 3], # noqa: B006 + ): + super().__init__() + + # self attention + self.self_attn = nn.MultiheadAttention( + d_model, + n_head, + dropout=dropout, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # cross attention + self.cross_attn = MSDeformableAttentionV2( + d_model, + n_head, + n_levels, + num_points_list, + ) + self.dropout2 = nn.Dropout(dropout) + + # gate + self.gateway = Gate(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = activation() + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + """Reset parameters of the model.""" + init.xavier_uniform_(self.linear1.weight) + init.xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor: Tensor, pos: Tensor | None) -> Tensor: + """Add positional embedding to the input tensor.""" + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt: Tensor) -> Tensor: + """Forward function of feed forward network.""" + return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + + def forward( + self, + target: Tensor, + reference_points: Tensor, + value: Tensor, + spatial_shapes: list[list[int]], + attn_mask: Tensor | None = None, + query_pos_embed: Tensor | None = None, + ) -> Tensor: + """Forward function of the Transformer Decoder Layer. + + Args: + target (Tensor): target feature tensor. + reference_points (Tensor): reference points tensor. + value (Tensor): value tensor. + spatial_shapes (list[list[int]]): spatial shapes of the value tensor. + attn_mask (Tensor | None, optional): attention mask. Defaults to None. + query_pos_embed (Tensor | None, optional): query positional embedding. Defaults to None. + + Returns: + Tensor: updated target tensor. + """ + # self attention + q = k = self.with_pos_embed(target, query_pos_embed) + + target2, _ = self.self_attn(q, k, value=target, attn_mask=attn_mask) + target = target + self.dropout1(target2) + target = self.norm1(target) + + # cross attention + target2 = self.cross_attn( + self.with_pos_embed(target, query_pos_embed), + reference_points, + value, + spatial_shapes, + ) + + target = self.gateway(target, self.dropout2(target2)) + + # ffn + target2 = self.forward_ffn(target) + target = target + self.dropout4(target2) + return self.norm3(target.clamp(min=-65504, max=65504)) + + +class Gate(nn.Module): + """Target Gating Layers. + + Args: + d_model (int): The number of expected features in the input. + """ + + def __init__(self, d_model: int) -> None: + super().__init__() + self.gate = nn.Linear(2 * d_model, 2 * d_model) + bias = bias_init_with_prob(0.5) + init.constant_(self.gate.bias, bias) + init.constant_(self.gate.weight, 0) + self.norm = nn.LayerNorm(d_model) + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + """Forward function of the gate. + + Args: + x1 (Tensor): first target input tensor. + x2 (Tensor): second target input tensor. + + Returns: + Tensor: gated target tensor. 
+ """ + gate_input = torch.cat([x1, x2], dim=-1) + gates = torch.sigmoid(self.gate(gate_input)) + gate1, gate2 = gates.chunk(2, dim=-1) + return self.norm(gate1 * x1 + gate2 * x2) + + +class Integral(nn.Module): + """A static layer that calculates integral results from a distribution. + + This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`, + where Pr(n) is the softmax probability vector representing the discrete + distribution, and W(n) is the non-uniform Weighting Function. + + Args: + reg_max (int): Max number of the discrete bins. Default is 32. + It can be adjusted based on the dataset or task requirements. + """ + + def __init__(self, reg_max: int = 32): + super().__init__() + self.reg_max = reg_max + + def forward(self, x: Tensor, box_distance_weight: Tensor) -> Tensor: + """Forward function of the Integral layer.""" + shape = x.shape + x = f.softmax(x.reshape(-1, self.reg_max + 1), dim=1) + x = f.linear(x, box_distance_weight).reshape(-1, 4) + return x.reshape([*list(shape[:-1]), -1]) + + +class LQE(nn.Module): + """Localization Quality Estimation. + + Args: + k (int): number of edge points. + hidden_dim (int): The number of expected features in the input. + num_layers (int): The number of layers in the MLP. + reg_max (int): Max number of the discrete bins. + """ + + def __init__( + self, + k: int, + hidden_dim: int, + num_layers: int, + reg_max: int, + ): + super().__init__() + self.k = k + self.reg_max = reg_max + self.reg_conf = MLP( + input_dim=4 * (k + 1), + hidden_dim=hidden_dim, + output_dim=1, + num_layers=num_layers, + activation=partial(nn.ReLU, inplace=True), + ) + init.constant_(self.reg_conf.layers[-1].bias, 0) + init.constant_(self.reg_conf.layers[-1].weight, 0) + + def forward(self, scores: Tensor, pred_corners: Tensor) -> Tensor: + """Forward function of the LQE layer. + + Args: + scores (Tensor): Prediction scores. + pred_corners (Tensor): Predicted bounding box corners. + + Returns: + Tensor: Updated scores. + """ + b, num_pred, _ = pred_corners.size() + prob = f.softmax(pred_corners.reshape(b, num_pred, 4, self.reg_max + 1), dim=-1) + prob_topk, _ = prob.topk(self.k, dim=-1) + stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1) + quality_score = self.reg_conf(stat.reshape(b, num_pred, -1)) + return scores + quality_score + + +class TransformerDecoder(nn.Module): + """Transformer Decoder implementing Fine-grained Distribution Refinement (FDR). + + This decoder refines object detection predictions through iterative updates across multiple layers, + utilizing attention mechanisms, location quality estimators, and distribution refinement techniques + to improve bounding box accuracy and robustness. + + Args: + hidden_dim (int): The number of expected features in the input. + decoder_layer (nn.Module): The decoder layer module. + decoder_layer_wide (nn.Module): The wide decoder layer module. + num_layers (int): The number of layers. + num_head (int): The number of heads in the multi-head attention models. + reg_max (int): The number of discrete bins for bounding box regression. + reg_scale (Tensor): The curvature of the Weighting Function. + up (Tensor): The upper bound of the sequence. + eval_idx (int, optional): evaluation index. Defaults to -1. 
+ """ + + def __init__( + self, + hidden_dim: int, + decoder_layer: nn.Module, + decoder_layer_wide: nn.Module, + num_layers: int, + num_head: int, + reg_max: int, + reg_scale: Tensor, + up: Tensor, + eval_idx: int = -1, + ) -> None: + super().__init__() + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.num_head = num_head + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + self.up, self.reg_scale, self.reg_max = up, reg_scale, reg_max + self.layers = nn.ModuleList( + [copy.deepcopy(decoder_layer) for _ in range(self.eval_idx + 1)] + + [copy.deepcopy(decoder_layer_wide) for _ in range(num_layers - self.eval_idx - 1)], + ) + self.lqe_layers = nn.ModuleList([copy.deepcopy(LQE(4, 64, 2, reg_max)) for _ in range(num_layers)]) + self.box_distance_weight = nn.Parameter( + dfine_weighting_function(self.reg_max, self.up, self.reg_scale), + requires_grad=False, + ) + + def value_op( + self, + memory: Tensor, + memory_spatial_shapes: list[list[int]], + ) -> tuple[Tensor, ...]: + """Preprocess values for MSDeformableAttention.""" + memory = memory.reshape(memory.shape[0], memory.shape[1], self.num_head, -1) + split_shape = [h * w for h, w in memory_spatial_shapes] + return memory.permute(0, 2, 3, 1).split(split_shape, dim=-1) + + def forward( + self, + target: Tensor, + ref_points_unact: Tensor, + memory: Tensor, + spatial_shapes: list[list[int]], + bbox_head: nn.Module, + score_head: nn.Module, + query_pos_head: nn.Module, + pre_bbox_head: nn.Module, + integral: nn.Module, + reg_scale: Tensor, + attn_mask: Tensor | None = None, + ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + """Forward function of the Transformer Decoder. + + Args: + target (Tensor): target feature tensor. + ref_points_unact (Tensor): reference points tensor. + memory (Tensor): memory tensor. + spatial_shapes (list[list[int]]): spatial shapes of the memory tensor. + bbox_head (nn.Module): bounding box head. + score_head (nn.Module): label score head. + query_pos_head (nn.Module): query position head. + pre_bbox_head (nn.Module): pre-bounding box head. + integral (nn.Module): integral module. + reg_scale (Tensor): number of discrete bins for bounding box regression. + attn_mask (Tensor | None, optional): attention mask tensor. Defaults to None. 
+ + Returns: + tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + out_bboxes (Tensor): bounding box predictions from all layers + out_logits (Tensor): label score predictions from all layers + out_corners (Tensor): bounding box corner predictions from all layers + out_refs (Tensor): reference points from all layers + pre_bboxes (Tensor): initial bounding box predictions + pre_scores (Tensor): initial label score predictions + """ + output = target + output_detach = pred_corners_undetach = 0 + value = self.value_op(memory, spatial_shapes) + + out_bboxes = [] + out_logits = [] + out_corners = [] + out_refs = [] + box_distance_weight = self.box_distance_weight + + ref_points_detach = f.sigmoid(ref_points_unact) + + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = query_pos_head(ref_points_detach).clamp(min=-10, max=10) + output = layer(output, ref_points_input, value, spatial_shapes, attn_mask, query_pos_embed) + + if i == 0: + # Initial bounding box predictions with inverse sigmoid refinement + pre_bboxes = f.sigmoid(pre_bbox_head(output) + inverse_sigmoid(ref_points_detach)) + pre_scores = score_head[0](output) + initial_ref_boxes = pre_bboxes.detach() + + # Refine bounding box corners using FDR, integrating previous layer's corrections + pred_corners = bbox_head[i](output + output_detach) + pred_corners_undetach + inter_ref_bbox = dfine_distance2bbox( + initial_ref_boxes, + integral(pred_corners, box_distance_weight), + reg_scale, + ) + + if self.training or i == self.eval_idx: + scores = score_head[i](output) + # Lqe does not affect the performance here. + scores = self.lqe_layers[i](scores, pred_corners) + out_logits.append(scores) + out_bboxes.append(inter_ref_bbox) + out_corners.append(pred_corners) + out_refs.append(initial_ref_boxes) + + if not self.training: + break + + pred_corners_undetach = pred_corners + ref_points_detach = inter_ref_bbox.detach() + output_detach = output.detach() + + return ( + torch.stack(out_bboxes), # out_bboxes + torch.stack(out_logits), # out_logits + torch.stack(out_corners), # out_corners + torch.stack(out_refs), # out_refs + pre_bboxes, + pre_scores, + ) + + +class DFINETransformerModule(nn.Module): + """D-FINE Transformer Module. + + Args: + num_classes (int, optional): num of classes. Defaults to 80. + hidden_dim (int, optional): Hidden dimension size.. Defaults to 256. + num_queries (int, optional): Number of queries. Defaults to 300. + feat_channels (list[int], optional): List of feature channels. Defaults to [256, 256, 256]. + num_points_list (list[int], optional): Number of points for each level. Defaults to [3, 6, 3]. + num_decoder_layers (int, optional): Number of decoder layers. Defaults to 6. + dim_feedforward (int, optional): Dimension of the feedforward network. Defaults to 1024. + dropout (float, optional): dropout rate. Defaults to 0.0. + activation (Callable[..., nn.Module], optional): activation layer. Defaults to nn.ReLU. + num_denoising (int, optional): Number of denoising samples. Defaults to 100. + label_noise_ratio (float, optional): Ratio of label noise. Defaults to 0.5. + box_noise_scale (float, optional): Scale of box noise. Defaults to 1.0. + eval_spatial_size (list[int], optional): Spatial size for evaluation. Defaults to [640, 640]. + eval_idx (int, optional): Evaluation index. Defaults to -1. + reg_scale (float, optional): The weight curvature. Defaults to 4.0. + reg_max (int, optional): The number of bins for box regression. Defaults to 32. 
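Putting the pieces together, a hypothetical inference-time call of the module described above; shapes assume the default 640x640 eval size with strides 8/16/32, and this is a sketch rather than a tested snippet:

```python
import torch
from otx.algo.detection.heads.dfine_decoder import DFINETransformerModule

head = DFINETransformerModule(num_classes=20).eval()
# Three feature maps matching feat_channels=[256, 256, 256] at strides 8/16/32.
feats = [
    torch.randn(1, 256, 80, 80),
    torch.randn(1, 256, 40, 40),
    torch.randn(1, 256, 20, 20),
]
with torch.no_grad():
    out = head(feats)
print(out["pred_logits"].shape, out["pred_boxes"].shape)
# expected: (1, 300, 20) and (1, 300, 4), boxes in normalized cxcywh
```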
+ """ + + def __init__( + self, + num_classes: int = 80, + hidden_dim: int = 256, + num_queries: int = 300, + feat_channels: list[int] = [256, 256, 256], # noqa: B006 + feat_strides: list[int] = [8, 16, 32], # noqa: B006 + num_levels: int = 3, + num_points_list: list[int] = [3, 6, 3], # noqa: B006 + nhead: int = 8, + num_decoder_layers: int = 6, + dim_feedforward: int = 1024, + dropout: float = 0.0, + activation: Callable[..., nn.Module] = nn.ReLU, + num_denoising: int = 100, + label_noise_ratio: float = 0.5, + box_noise_scale: float = 1.0, + eval_spatial_size: list[int] = [640, 640], # noqa: B006 + eval_idx: int = -1, + reg_scale: float = 4.0, + reg_max: int = 32, + ): + super().__init__() + for _ in range(num_levels - len(feat_strides)): + feat_strides.append(feat_strides[-1] * 2) + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.feat_strides = feat_strides + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = 1e-2 + self.num_decoder_layers = num_decoder_layers + self.eval_spatial_size = eval_spatial_size + self.reg_max = reg_max + + # backbone feature projection + self._build_input_proj_layer(feat_channels) + + # Transformer module + self.up = nn.Parameter(torch.tensor([0.5]), requires_grad=False) + self.reg_scale = nn.Parameter(torch.tensor([reg_scale]), requires_grad=False) + decoder_layer = TransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_levels, + num_points_list, + ) + decoder_layer_wide = TransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_levels, + num_points_list, + ) + self.decoder = TransformerDecoder( + hidden_dim, + decoder_layer, + decoder_layer_wide, + num_decoder_layers, + nhead, + reg_max, + self.reg_scale, + self.up, + eval_idx, + ) + # denoising + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + if num_denoising > 0: + self.denoising_class_embed = nn.Embedding(num_classes + 1, hidden_dim, padding_idx=num_classes) + init.normal_(self.denoising_class_embed.weight[:-1]) + + # decoder embedding + self.query_pos_head = MLP( + input_dim=4, + hidden_dim=2 * hidden_dim, + output_dim=hidden_dim, + num_layers=2, + activation=partial(nn.ReLU, inplace=True), + ) + + # encoder head + self.enc_output = nn.Sequential( + OrderedDict( + [ + ("proj", nn.Linear(hidden_dim, hidden_dim)), + ("norm", nn.LayerNorm(hidden_dim)), + ], + ), + ) + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + self.enc_bbox_head = MLP( + input_dim=hidden_dim, + hidden_dim=hidden_dim, + output_dim=4, + num_layers=3, + activation=partial(nn.ReLU, inplace=True), + ) + + # decoder head + self.eval_idx = eval_idx if eval_idx >= 0 else num_decoder_layers + eval_idx + self.dec_score_head = nn.ModuleList([nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers)]) + # distribution refinement over num of self.reg_max bins + self.dec_bbox_head = nn.ModuleList( + [ + MLP( + input_dim=hidden_dim, + hidden_dim=hidden_dim, + output_dim=4 * (self.reg_max + 1), + num_layers=3, + activation=partial(nn.ReLU, inplace=True), + ) + for _ in range(num_decoder_layers) + ], + ) + self.pre_bbox_head = MLP( + input_dim=hidden_dim, + hidden_dim=hidden_dim, + output_dim=4, + num_layers=3, + activation=partial(nn.ReLU, inplace=True), + ) + + self.integral = Integral(self.reg_max) + + # init encoder output anchors and valid_mask + if self.eval_spatial_size: + self.anchors, 
self.valid_mask = self._generate_anchors() + + self._reset_parameters(feat_channels) + + def _reset_parameters(self, feat_channels: list[int]) -> None: + """Reset parameters of the module.""" + bias = bias_init_with_prob(0.01) + init.constant_(self.enc_score_head.bias, bias) + init.constant_(self.enc_bbox_head.layers[-1].weight, 0) + init.constant_(self.enc_bbox_head.layers[-1].bias, 0) + + init.constant_(self.pre_bbox_head.layers[-1].weight, 0) + init.constant_(self.pre_bbox_head.layers[-1].bias, 0) + + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + init.constant_(cls_.bias, bias) + if hasattr(reg_, "layers"): + init.constant_(reg_.layers[-1].weight, 0) + init.constant_(reg_.layers[-1].bias, 0) + + init.xavier_uniform_(self.enc_output[0].weight) + init.xavier_uniform_(self.query_pos_head.layers[0].weight) + init.xavier_uniform_(self.query_pos_head.layers[1].weight) + for m, in_channels in zip(self.input_proj, feat_channels): + if in_channels != self.hidden_dim: + init.xavier_uniform_(m[0].weight) + + def _build_input_proj_layer(self, feat_channels: list[int]) -> None: + """Build input projection layer.""" + self.input_proj = nn.ModuleList() + for in_channels in feat_channels: + if in_channels == self.hidden_dim: + self.input_proj.append(nn.Identity()) + else: + self.input_proj.append( + nn.Sequential( + OrderedDict( + [ + ("conv", nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)), + ("norm", nn.BatchNorm2d(self.hidden_dim)), + ], + ), + ), + ) + + in_channels = feat_channels[-1] + + for _ in range(self.num_levels - len(feat_channels)): + if in_channels == self.hidden_dim: + self.input_proj.append(nn.Identity()) + else: + self.input_proj.append( + nn.Sequential( + OrderedDict( + [ + ("conv", nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)), + ("norm", nn.BatchNorm2d(self.hidden_dim)), + ], + ), + ), + ) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats: list[Tensor]) -> tuple[Tensor, list[list[int]]]: + """Flatten feature maps and get spatial shapes for encoder input. + + Args: + feats (list[Tensor]): List of feature maps. + + Returns: + tuple[Tensor, list[list[int]]]: + Tensor: Flattened feature maps. + list[list[int]]: List of spatial shapes for each feature map. 
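`_get_encoder_input` above simply concatenates the per-level maps along the token axis and records their spatial shapes. The same reshape in isolation (pure torch, illustrative shapes):

```python
import torch

feats = [
    torch.randn(2, 256, 80, 80),
    torch.randn(2, 256, 40, 40),
    torch.randn(2, 256, 20, 20),
]
flattened, spatial_shapes = [], []
for feat in feats:
    _, _, h, w = feat.shape
    flattened.append(feat.flatten(2).permute(0, 2, 1))  # [B, C, H, W] -> [B, H*W, C]
    spatial_shapes.append([h, w])
memory = torch.concat(flattened, dim=1)
print(memory.shape)    # torch.Size([2, 8400, 256])  (6400 + 1600 + 400 tokens)
print(spatial_shapes)  # [[80, 80], [40, 40], [20, 20]]
```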
+ """ + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + for feat in proj_feats: + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).permute(0, 2, 1)) + # [num_levels, 2] + spatial_shapes.append([h, w]) + + # [b, l, c] + feat_flatten = torch.concat(feat_flatten, 1) + return feat_flatten, spatial_shapes + + def _generate_anchors( + self, + spatial_shapes: list[list[int]] | None = None, + grid_size: float = 0.05, + dtype: torch.dtype = torch.float32, + device: str = "cpu", + ) -> tuple[Tensor, Tensor]: + if spatial_shapes is None: + spatial_shapes = [] + eval_h, eval_w = self.eval_spatial_size + for s in self.feat_strides: + spatial_shapes.append([int(eval_h / s), int(eval_w / s)]) + + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij") + grid_xy = torch.stack([grid_x, grid_y], dim=-1) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype) + wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl) + lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4) + anchors.append(lvl_anchors) + + tensor_anchors = torch.concat(anchors, dim=1).to(device) + valid_mask = ((tensor_anchors > self.eps) * (tensor_anchors < 1 - self.eps)).all(-1, keepdim=True) + tensor_anchors = torch.log(tensor_anchors / (1 - tensor_anchors)) + tensor_anchors = torch.where(valid_mask, tensor_anchors, torch.inf) + + return tensor_anchors, valid_mask + + def _get_decoder_input( + self, + memory: Tensor, + spatial_shapes: list[list[int]], + denoising_logits: Tensor | None = None, + denoising_bbox_unact: Tensor | None = None, + ) -> tuple[torch.Tensor, ...]: + # prepare input for decoder + if self.training or self.eval_spatial_size is None: + anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device) + else: + anchors, valid_mask = self.anchors.to(memory.device), self.valid_mask.to(memory.device) + + if memory.shape[0] > 1: + anchors = anchors.repeat(memory.shape[0], 1, 1) + + memory = valid_mask.to(memory.dtype) * memory + + output_memory = self.enc_output(memory) + enc_outputs_logits = self.enc_score_head(output_memory) + + enc_topk_bboxes_list, enc_topk_logits_list = [], [] + enc_topk_memory, enc_topk_logits, enc_topk_anchors = self._select_topk( + output_memory, + enc_outputs_logits, + anchors, + self.num_queries, + ) + + enc_topk_bbox_unact = self.enc_bbox_head(enc_topk_memory) + enc_topk_anchors + + if self.training: + enc_topk_bboxes = f.sigmoid(enc_topk_bbox_unact) + enc_topk_bboxes_list.append(enc_topk_bboxes) + enc_topk_logits_list.append(enc_topk_logits) + + content = enc_topk_memory.detach() + content = enc_topk_memory.detach() + + content = enc_topk_memory.detach() + + enc_topk_bbox_unact = enc_topk_bbox_unact.detach() + + if denoising_bbox_unact is not None: + enc_topk_bbox_unact = torch.concat([denoising_bbox_unact, enc_topk_bbox_unact], dim=1) + content = torch.concat([denoising_logits, content], dim=1) + + return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list + + def _select_topk( + self, + memory: Tensor, + outputs_logits: Tensor, 
+ outputs_anchors_unact: Tensor, + topk: int, + ) -> tuple[Tensor, Tensor, Tensor]: + """Select top-k memory, logits, and anchors. + + Args: + memory (Tensor): memory tensor. + outputs_logits (Tensor): logits tensor. + outputs_anchors_unact (Tensor): unactivated anchors tensor. + topk (int): number of top-k to select. + + Returns: + tuple[Tensor, Tensor, Tensor]: + Tensor: top-k memory tensor. + Tensor: top-k logits tensor. + Tensor: top-k anchors tensor. + """ + _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1) + topk_anchors = outputs_anchors_unact.gather( + dim=1, + index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_anchors_unact.shape[-1]), + ) + + topk_logits = ( + outputs_logits.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1])) + if self.training + else None + ) + + topk_memory = memory.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, memory.shape[-1])) + + return topk_memory, topk_logits, topk_anchors + + def forward(self, feats: Tensor, targets: list[dict[str, Tensor]] | None = None) -> dict[str, Tensor]: + """Forward pass of the DFine Transformer module.""" + # input projection and embedding + memory, spatial_shapes = self._get_encoder_input(feats) + + # prepare denoising training + if self.training and self.num_denoising > 0 and targets is not None: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = get_contrastive_denoising_training_group( + targets, + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=1.0, + ) + else: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + init_ref_contents, init_ref_points_unact, enc_topk_bboxes_list, enc_topk_logits_list = self._get_decoder_input( + memory, + spatial_shapes, + denoising_logits, + denoising_bbox_unact, + ) + + # decoder + out_bboxes, out_logits, out_corners, out_refs, pre_bboxes, pre_logits = self.decoder( + target=init_ref_contents, + ref_points_unact=init_ref_points_unact, + memory=memory, + spatial_shapes=spatial_shapes, + bbox_head=self.dec_bbox_head, + score_head=self.dec_score_head, + query_pos_head=self.query_pos_head, + pre_bbox_head=self.pre_bbox_head, + integral=self.integral, + reg_scale=self.reg_scale, + attn_mask=attn_mask, + ) + + if self.training and dn_meta is not None: + dn_pre_logits, pre_logits = torch.split(pre_logits, dn_meta["dn_num_split"], dim=1) + dn_pre_bboxes, pre_bboxes = torch.split(pre_bboxes, dn_meta["dn_num_split"], dim=1) + dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta["dn_num_split"], dim=2) + dn_out_logits, out_logits = torch.split(out_logits, dn_meta["dn_num_split"], dim=2) + + dn_out_corners, out_corners = torch.split(out_corners, dn_meta["dn_num_split"], dim=2) + dn_out_refs, out_refs = torch.split(out_refs, dn_meta["dn_num_split"], dim=2) + + if self.training: + out = { + "pred_logits": out_logits[-1], + "pred_boxes": out_bboxes[-1], + "pred_corners": out_corners[-1], + "ref_points": out_refs[-1], + "up": self.up, + "reg_scale": self.reg_scale, + } + out["aux_outputs"] = self._set_aux_loss2( + outputs_class=out_logits[:-1], + outputs_coord=out_bboxes[:-1], + outputs_corners=out_corners[:-1], + outputs_ref=out_refs[:-1], + teacher_corners=out_corners[-1], + teacher_logits=out_logits[-1], + ) + out["enc_aux_outputs"] = self._set_aux_loss( + enc_topk_logits_list, + enc_topk_bboxes_list, + ) + out["pre_outputs"] = { + "pred_logits": pre_logits, + "pred_boxes": 
pre_bboxes, + } + + if dn_meta is not None: + out["dn_outputs"] = self._set_aux_loss2( + outputs_class=dn_out_logits, + outputs_coord=dn_out_bboxes, + outputs_corners=dn_out_corners, + outputs_ref=dn_out_refs, + teacher_corners=dn_out_corners[-1], + teacher_logits=dn_out_logits[-1], + ) + out["dn_pre_outputs"] = { + "pred_logits": dn_pre_logits, + "pred_boxes": dn_pre_bboxes, + } + out["dn_meta"] = dn_meta + else: + out = { + "pred_logits": out_logits[-1], + "pred_boxes": out_bboxes[-1], + } + + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class: Tensor, outputs_coord: Tensor) -> list[dict[str, Tensor]]: + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{"pred_logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)] + + @torch.jit.unused + def _set_aux_loss2( + self, + outputs_class: Tensor, + outputs_coord: Tensor, + outputs_corners: Tensor, + outputs_ref: Tensor, + teacher_corners: Tensor, + teacher_logits: Tensor, + ) -> list[dict[str, Tensor]]: + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [ + { + "pred_logits": a, + "pred_boxes": b, + "pred_corners": c, + "ref_points": d, + "teacher_corners": teacher_corners, + "teacher_logits": teacher_logits, + } + for a, b, c, d in zip(outputs_class, outputs_coord, outputs_corners, outputs_ref) + ] + + +class DFINETransformer: + """DFINETransformer factory for detection.""" + + decoder_cfg: ClassVar[dict[str, Any]] = { + "dfine_hgnetv2_n": { + "feat_channels": [128, 128], + "feat_strides": [16, 32], + "hidden_dim": 128, + "dim_feedforward": 512, + "num_levels": 2, + "num_decoder_layers": 3, + "eval_idx": -1, + "num_points_list": [6, 6], + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_s": { + "feat_channels": [256, 256, 256], + "num_decoder_layers": 3, + "eval_idx": -1, + "eval_spatial_size": [640, 640], + "num_points_list": [3, 6, 3], + }, + "dfine_hgnetv2_m": { + "num_decoder_layers": 4, + "eval_idx": -1, + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_l": {}, + "dfine_hgnetv2_x": { + "feat_channels": [384, 384, 384], + "reg_scale": 8.0, + "eval_idx": -1, + "eval_spatial_size": [640, 640], + }, + } + + def __new__(cls, model_name: str, num_classes: int) -> DFINETransformerModule: + """Constructor for DFINETransformerModule.""" + cfg = cls.decoder_cfg[model_name] + return DFINETransformerModule(num_classes=num_classes, **cfg) diff --git a/src/otx/algo/detection/layers/csp_layer.py b/src/otx/algo/detection/layers/csp_layer.py index 6a1c32f5693..d8f76a930a3 100644 --- a/src/otx/algo/detection/layers/csp_layer.py +++ b/src/otx/algo/detection/layers/csp_layer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. 
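The `DFINETransformer` factory defined above only unpacks the per-variant `decoder_cfg` into `DFINETransformerModule`, so for the "n" variant the two constructions in this sketch are equivalent (values copied from the config table above; the snippet itself is illustrative):

```python
from otx.algo.detection.heads.dfine_decoder import DFINETransformer, DFINETransformerModule

head = DFINETransformer(model_name="dfine_hgnetv2_n", num_classes=3)
same = DFINETransformerModule(
    num_classes=3,
    feat_channels=[128, 128],
    feat_strides=[16, 32],
    hidden_dim=128,
    dim_feedforward=512,
    num_levels=2,
    num_decoder_layers=3,
    eval_idx=-1,
    num_points_list=[6, 6],
    eval_spatial_size=[640, 640],
)
```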
"""Implementation of CSPLayer copied from mmdet.models.layers.csp_layer.py.""" @@ -193,7 +193,11 @@ def __init__( normalization=build_norm_layer(normalization, num_features=ch_out), activation=None, ) - self.act = activation() if activation else nn.Identity() + if isinstance(activation, type): + activation = activation() + if activation is None: + activation = nn.Identity() + self.act = activation def forward(self, x: Tensor) -> Tensor: """Forward function.""" @@ -378,7 +382,7 @@ def __init__( RepVggBlock( hidden_channels, hidden_channels, - activation=activation, + activation=build_activation_layer(activation), normalization=normalization, ) for _ in range(num_blocks) diff --git a/src/otx/algo/detection/losses/__init__.py b/src/otx/algo/detection/losses/__init__.py index 14ca6431030..44124aeddc4 100644 --- a/src/otx/algo/detection/losses/__init__.py +++ b/src/otx/algo/detection/losses/__init__.py @@ -1,9 +1,10 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """Custom OTX Losses for Object Detection.""" from .atss_loss import ATSSCriterion +from .dfine_loss import DFINECriterion from .rtdetr_loss import DetrCriterion from .rtmdet_loss import RTMDetCriterion from .ssd_loss import SSDCriterion @@ -17,4 +18,5 @@ "SSDCriterion", "YOLOv9Criterion", "YOLOXCriterion", + "DFINECriterion", ] diff --git a/src/otx/algo/detection/losses/dfine_loss.py b/src/otx/algo/detection/losses/dfine_loss.py new file mode 100644 index 00000000000..8c438922f18 --- /dev/null +++ b/src/otx/algo/detection/losses/dfine_loss.py @@ -0,0 +1,501 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""D-FINE criterion implementations. Modified from https://github.com/Peterande/D-FINE.""" + + +from __future__ import annotations + +from typing import Callable + +import torch +import torch.distributed +import torch.nn.functional as f +from torch import Tensor, nn +from torchvision.ops import box_convert + +from otx.algo.common.utils.assigners.hungarian_matcher import HungarianMatcher +from otx.algo.common.utils.bbox_overlaps import bbox_overlaps +from otx.algo.detection.utils.utils import dfine_bbox2distance + + +class DFINECriterion(nn.Module): + """D-Fine criterion with FGL and DDF losses. + + TODO(Eugene): Consider merge with RTDETRCriterion in the next PR. + + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + + Args: + weight_dict (dict[str, int | float]): A dictionary containing the weights for different loss components. + alpha (float, optional): The alpha parameter for the loss calculation. Defaults to 0.2. + gamma (float, optional): The gamma parameter for the loss calculation. Defaults to 2.0. + num_classes (int, optional): The number of classes. Defaults to 80. + reg_max (int, optional): The maximum number of bin targets. Defaults to 32. 
+ """ + + def __init__( + self, + weight_dict: dict[str, int | float], + alpha: float = 0.2, + gamma: float = 2.0, + num_classes: int = 80, + reg_max: int = 32, + ): + super().__init__() + self.num_classes = num_classes + self.matcher = HungarianMatcher( + cost_dict={ + "cost_class": 2.0, + "cost_bbox": 5.0, + "cost_giou": 2.0, + }, + ) + self.weight_dict = weight_dict + self.alpha = alpha + self.gamma = gamma + self.reg_max = reg_max + self.num_pos, self.num_neg = 0.0, 0.0 + + def loss_labels_vfl( + self, + outputs: dict[str, Tensor], + targets: list[dict[str, Tensor]], + indices: list[tuple[int, int]], + num_boxes: int, + ) -> dict[str, Tensor]: + """Varifocal Loss (VFL) for label prediction. + + Args: + outputs (dict[str, Tensor]): Model outputs. + targets (List[Dict[str, Tensor]]): List of target dictionaries. + indices (List[Tuple[int, int]]): List of tuples of indices. + num_boxes (int): Number of predicted boxes. + + Returns: + dict[str, Tensor]: The loss dictionary. + """ + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + ious = bbox_overlaps( + box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + ) + ious = torch.diag(ious).detach() + + src_logits = outputs["pred_logits"] + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + target = f.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + + target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype) + target_score_o[idx] = ious.to(target_score_o.dtype) + target_score = target_score_o.unsqueeze(-1) * target + + pred_score = f.sigmoid(src_logits).detach() + weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score + + loss = f.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction="none") + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {"loss_vfl": loss} + + def loss_boxes( + self, + outputs: dict[str, Tensor], + targets: list[dict[str, Tensor]], + indices: list[tuple[int, int]], + num_boxes: int, + ) -> dict[str, Tensor]: + """Compute the losses re)L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + + Args: + outputs (dict[str, Tensor]): The outputs of the model. + targets (list[dict[str, Tensor]]): The targets. + indices (list[tuple[int, int]]): The indices of the matched boxes. + num_boxes (int): The number of boxes. + + Returns: + dict[str, Tensor]: The losses. 
+ """ + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + losses = {} + loss_bbox = f.l1_loss(src_boxes, target_boxes, reduction="none") + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + bbox_overlaps( + box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + mode="giou", + ), + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + + return losses + + def loss_local( + self, + outputs: dict[str, Tensor], + targets: list[dict[str, Tensor]], + indices: list[tuple[int, int]], + num_boxes: int, + temperature: int = 5, + ) -> dict[str, Tensor]: + """Compute Fine-Grained Localization (FGL) Loss and Decoupled Distillation Focal (DDF) Loss. + + Args: + outputs (dict[str, Tensor]): The outputs of the model. + targets (list[dict[str, Tensor]]): The targets. + indices (list[tuple[int, int]]): The indices of the matched boxes. + num_boxes (int): The number of boxes. + temperature (int, optional): Temperature for distillation. Defaults to 5. + + Returns: + dict[str, Tensor]: FGL and DDF losses. + """ + losses = {} + if "pred_corners" in outputs: + idx = self._get_src_permutation_idx(indices) + target_boxes = torch.cat( + [t["boxes"][i] for t, (_, i) in zip(targets, indices)], + dim=0, + ) + + pred_corners = outputs["pred_corners"][idx].reshape(-1, (self.reg_max + 1)) + ref_points = outputs["ref_points"][idx].detach() + with torch.no_grad(): + target_corners, weight_right, weight_left = dfine_bbox2distance( + ref_points, + box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + self.reg_max, + outputs["reg_scale"], + outputs["up"], + ) + + ious = torch.diag( + bbox_overlaps( + box_convert(outputs["pred_boxes"][idx], in_fmt="cxcywh", out_fmt="xyxy"), + box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"), + ), + ) + weight_targets = ious.unsqueeze(-1).repeat(1, 1, 4).reshape(-1).detach() + + losses["loss_fgl"] = DFINECriterion.fgl_loss( + pred_corners, + target_corners, + weight_right, + weight_left, + weight_targets, + avg_factor=num_boxes, + ) + + # Compute Decoupled Distillation Focal (DDF) Loss + if "teacher_corners" in outputs and outputs["teacher_corners"] is not None: + pred_corners = outputs["pred_corners"].reshape(-1, (self.reg_max + 1)) + target_corners = outputs["teacher_corners"].reshape(-1, (self.reg_max + 1)) + if torch.equal(pred_corners, target_corners): + losses["loss_ddf"] = pred_corners.sum() * 0 + else: + weight_targets_local = outputs["teacher_logits"].sigmoid().max(dim=-1)[0] + + mask = torch.zeros_like(weight_targets_local, dtype=torch.bool) + mask[idx] = True + mask = mask.unsqueeze(-1).repeat(1, 1, 4).reshape(-1) + + weight_targets_local[idx] = ious.reshape_as(weight_targets_local[idx]).to( + weight_targets_local.dtype, + ) + weight_targets_local = weight_targets_local.unsqueeze(-1).repeat(1, 1, 4).reshape(-1).detach() + + loss_match_local = ( + weight_targets_local + * (temperature**2) + * ( + nn.KLDivLoss(reduction="none")( + f.log_softmax(pred_corners / temperature, dim=1), + f.softmax(target_corners.detach() / temperature, dim=1), + ) + ).sum(-1) + ) + if "is_dn" not in outputs: + batch_scale = 8 / outputs["pred_boxes"].shape[0] # Avoid the influence of batch size per GPU + self.num_pos, self.num_neg = ( + (mask.sum() * batch_scale) ** 0.5, + ((~mask).sum() * batch_scale) ** 0.5, + ) + loss_match_local1 = loss_match_local[mask].mean() if 
mask.any() else 0 + loss_match_local2 = loss_match_local[~mask].mean() if (~mask).any() else 0 + losses["loss_ddf"] = (loss_match_local1 * self.num_pos + loss_match_local2 * self.num_neg) / ( + self.num_pos + self.num_neg + ) + + return losses + + def _get_src_permutation_idx( + self, + indices: list[tuple[Tensor, Tensor]], + ) -> tuple[Tensor, Tensor]: + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_go_indices( + self, + indices: list[tuple[Tensor, Tensor]], + indices_aux_list: list[list[tuple[Tensor, Tensor]]], + ) -> list[Tensor]: + """Get a matching union set across all decoder layers. + + Args: + indices: matching indices of the last decoder layer + indices_aux_list: matching indices of all decoder layers + """ + results = [] + for indices_aux in indices_aux_list: + indices = [ + (torch.cat([idx1[0], idx2[0]]), torch.cat([idx1[1], idx2[1]])) + for idx1, idx2 in zip(indices.copy(), indices_aux.copy()) + ] + + for ind in [torch.cat([idx[0][:, None], idx[1][:, None]], 1) for idx in indices]: + unique, counts = torch.unique(ind, return_counts=True, dim=0) + count_sort_indices = torch.argsort(counts, descending=True) + unique_sorted = unique[count_sort_indices] + column_to_row = {} + for idx in unique_sorted: + row_idx, col_idx = idx[0].item(), idx[1].item() + if row_idx not in column_to_row: + column_to_row[row_idx] = col_idx + final_rows = torch.tensor(list(column_to_row.keys()), device=ind.device) + final_cols = torch.tensor(list(column_to_row.values()), device=ind.device) + results.append((final_rows.long(), final_cols.long())) + return results + + @property + def _available_losses(self) -> tuple[Callable]: + return (self.loss_boxes, self.loss_labels_vfl, self.loss_local) # type: ignore[return-value] + + def forward( + self, + outputs: dict[str, Tensor], + targets: list[dict[str, Tensor]], + ) -> dict[str, Tensor]: + """This performs the loss computation. + + Args: + outputs (dict[str, torch.Tensor]): dict of tensors, see the output + specification of the model for the format + targets (list[dict[str, torch.Tensor]]): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + Returns: + dict[str, torch.Tensor]: dict of losses + """ + outputs_without_aux = {k: v for k, v in outputs.items() if "aux" not in k} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Get the matching union set across all decoder layers. 
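`_get_go_indices` above builds that union: per-layer (query, gt) matches are concatenated, duplicates are counted, and each query is kept once with the ground truth it was matched to most often. A toy reproduction of the deduplication step (pure torch, fabricated index pairs):

```python
import torch

# (query index, gt index) pairs gathered from the last layer plus two aux layers
pairs = torch.tensor([[0, 1], [2, 0], [0, 1], [3, 2], [2, 0], [0, 2]])
unique, counts = torch.unique(pairs, return_counts=True, dim=0)
order = torch.argsort(counts, descending=True)
unique_sorted = unique[order]

query_to_gt: dict[int, int] = {}
for q, g in unique_sorted.tolist():
    query_to_gt.setdefault(q, g)   # keep the most frequent match per query

rows = torch.tensor(list(query_to_gt.keys()))
cols = torch.tensor(list(query_to_gt.values()))
print(rows.tolist(), cols.tolist())   # e.g. [0, 2, 3] matched to [1, 0, 2]
```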
+ indices_aux_list, cached_indices, cached_indices_enc = [], [], [] + for aux_outputs in outputs["aux_outputs"] + [outputs["pre_outputs"]]: + indices_aux = self.matcher(aux_outputs, targets) + cached_indices.append(indices_aux) + indices_aux_list.append(indices_aux) + for aux_outputs in outputs["enc_aux_outputs"]: + indices_enc = self.matcher(aux_outputs, targets) + cached_indices_enc.append(indices_enc) + indices_aux_list.append(indices_enc) + indices_go = self._get_go_indices(indices, indices_aux_list) + + num_boxes_go = sum(len(x[0]) for x in indices_go) + num_boxes_go = torch.as_tensor( + [num_boxes_go], + dtype=torch.float, + device=next(iter(outputs.values())).device, + ) + num_boxes_go = torch.clamp(num_boxes_go, min=1).item() + + # Compute the average number of target boxes across all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + num_boxes = torch.clamp(num_boxes, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self._available_losses: + indices_in = indices_go if loss in [self.loss_boxes, self.loss_local] else indices + num_boxes_in = num_boxes_go if loss in [self.loss_boxes, self.loss_local] else num_boxes + l_dict = loss(outputs, targets, indices_in, num_boxes_in) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + losses.update(l_dict) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + aux_outputs["up"], aux_outputs["reg_scale"] = outputs["up"], outputs["reg_scale"] + for loss in self._available_losses: + if loss in [self.loss_boxes, self.loss_local]: + indices_in = indices_go + num_boxes_in = num_boxes_go + else: + indices_in = cached_indices[i] + num_boxes_in = num_boxes + l_dict = loss(aux_outputs, targets, indices_in, num_boxes_in) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f"_aux_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of auxiliary traditional head output at first decoder layer. + if "pre_outputs" in outputs: + aux_outputs = outputs["pre_outputs"] + for loss in self._available_losses: + if loss in [self.loss_boxes, self.loss_local]: + indices_in = indices_go + num_boxes_in = num_boxes_go + else: + indices_in = cached_indices[-1] + num_boxes_in = num_boxes + l_dict = loss(aux_outputs, targets, indices_in, num_boxes_in) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + "_pre": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of encoder auxiliary losses. + if "enc_aux_outputs" in outputs: + enc_targets = targets + for i, aux_outputs in enumerate(outputs["enc_aux_outputs"]): + for loss in self._available_losses: + if loss == self.loss_boxes: + indices_in = indices_go + num_boxes_in = num_boxes_go + else: + indices_in = cached_indices_enc[i] + num_boxes_in = num_boxes + l_dict = loss(aux_outputs, enc_targets, indices_in, num_boxes_in) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f"_enc_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of cdn auxiliary losses. 
For dfine + if "dn_outputs" in outputs: + indices_dn = self.get_cdn_matched_indices(outputs["dn_meta"], targets) + dn_num_boxes = num_boxes * outputs["dn_meta"]["dn_num_group"] + dn_num_boxes = dn_num_boxes if dn_num_boxes > 0 else 1 + + for i, aux_outputs in enumerate(outputs["dn_outputs"]): + aux_outputs["is_dn"] = True + aux_outputs["up"], aux_outputs["reg_scale"] = outputs["up"], outputs["reg_scale"] + for loss in self._available_losses: + l_dict = loss(aux_outputs, targets, indices_dn, dn_num_boxes) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f"_dn_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of auxiliary traditional head output at first decoder layer. + if "dn_pre_outputs" in outputs: + aux_outputs = outputs["dn_pre_outputs"] + for loss in self._available_losses: + l_dict = loss(aux_outputs, targets, indices_dn, dn_num_boxes) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + "_dn_pre": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + @staticmethod + def get_cdn_matched_indices( + dn_meta: dict[str, list[Tensor]], + targets: list[dict[str, Tensor]], + ) -> list[tuple[torch.Tensor, torch.Tensor]]: + """get_cdn_matched_indices. + + Args: + dn_meta (dict[str, list[torch.Tensor]]): meta data for cdn + targets (list[dict[str, torch.Tensor]]): targets + """ + dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + num_gts = [len(t["labels"]) for t in targets] + device = targets[0]["labels"].device + + dn_match_indices = [] + for i, num_gt in enumerate(num_gts): + if num_gt > 0: + gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device) + gt_idx = gt_idx.tile(dn_num_group) + if len(dn_positive_idx[i]) != len(gt_idx): + msg = "The number of positive indices should be equal to the number of ground truths." + raise ValueError(msg) + dn_match_indices.append((dn_positive_idx[i], gt_idx)) + else: + dn_match_indices.append( + ( + torch.zeros(0, dtype=torch.int64, device=device), + torch.zeros(0, dtype=torch.int64, device=device), + ), + ) + + return dn_match_indices + + @staticmethod + def fgl_loss( + preds: Tensor, + targets: Tensor, + weight_right: Tensor, + weight_left: Tensor, + iou_weight: Tensor | None = None, + reduction: str = "sum", + avg_factor: float | None = None, + ) -> Tensor: + """Fine-Grained Localization (FGL) Loss. + + Args: + preds (Tensor): predicted distances + targets (Tensor): target distances + weight_right (Tensor): weight for right distance + weight_left (Tensor): weight for left distance + iou_weight (Tensor, optional): IoU weight. Defaults to None. + reduction (str, optional): reduction method. Defaults to "sum". + avg_factor (float, optional): average factor. Defaults to None. 
+ + Returns: + Tensor: FGL loss + """ + dis_left = targets.long() + dis_right = dis_left + 1 + + loss_left = f.cross_entropy( + preds, + dis_left, + reduction="none", + ) * weight_left.reshape(-1) + + loss_right = f.cross_entropy( + preds, + dis_right, + reduction="none", + ) * weight_right.reshape(-1) + + loss = loss_left + loss_right + + if iou_weight is not None: + iou_weight = iou_weight.float() + loss = loss * iou_weight + + if avg_factor is not None: + loss = loss.sum() / avg_factor + elif reduction == "mean": + loss = loss.mean() + elif reduction == "sum": + loss = loss.sum() + + return loss diff --git a/src/otx/algo/detection/necks/dfine_hybrid_encoder.py b/src/otx/algo/detection/necks/dfine_hybrid_encoder.py new file mode 100644 index 00000000000..918cdfff878 --- /dev/null +++ b/src/otx/algo/detection/necks/dfine_hybrid_encoder.py @@ -0,0 +1,438 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""D-FINE Hybrid Encoder. Modified from D-FINE (https://github.com/Peterande/D-FINE).""" + +from __future__ import annotations + +import copy +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, ClassVar + +import torch +import torch.nn.functional as f +from torch import Tensor, nn + +from otx.algo.common.layers.transformer_layers import TransformerEncoder, TransformerEncoderLayer +from otx.algo.detection.layers.csp_layer import CSPRepLayer +from otx.algo.detection.utils.utils import auto_pad +from otx.algo.modules.activation import build_activation_layer +from otx.algo.modules.conv_module import Conv2dModule +from otx.algo.modules.norm import build_norm_layer + + +class SCDown(nn.Module): + """SCDown downsampling module. + + Args: + c1 (int): Number of channels in the input feature map. + c2 (int): Number of channels produced by the convolution. + k (int): Kernel size of the convolving kernel. + s (int): Stride of the convolution. + normalization (Callable[..., nn.Module] | None): Normalization layer module. + """ + + def __init__( + self, + c1: int, + c2: int, + k: int, + s: int, + normalization: Callable[..., nn.Module] | None = None, + ) -> None: + super().__init__() + self.cv1 = Conv2dModule( + c1, + c2, + 1, + 1, + normalization=build_norm_layer(normalization, num_features=c2), + activation=None, + ) + self.cv2 = Conv2dModule( + c2, + c2, + k, + s, + padding=auto_pad(kernel_size=k), + groups=c2, + normalization=build_norm_layer(normalization, num_features=c2), + activation=None, + ) + + def forward(self, x: Tensor) -> Tensor: + """Forward pass.""" + return self.cv2(self.cv1(x)) + + +class RepNCSPELAN4(nn.Module): + """GELANModule from YOLOv9. + + Note: + Might not be replaceable as layer implementation is very different from GELANModule in YOLOv9. + + Args: + c1 (int): c1 channel size. Refer to GELAN paper. + c2 (int): c2 channel size. Refer to GELAN paper. + c3 (int): c3 channel size. Refer to GELAN paper. + c4 (int): c4 channel size. Refer to GELAN paper. + n (int, optional): number of blocks. Defaults to 3. + bias (bool, optional): use bias. Defaults to False. + activation (Callable[..., nn.Module] | None, optional): activation function. Defaults to None. + normalization (Callable[..., nn.Module] | None, optional): norm layer. Defaults to None. 
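`SCDown` above is a cheap downsampler: a 1x1 pointwise conv to set the channel count, then a stride-s depthwise conv (groups equal to the channel count). The equivalent in plain torch, with normalization omitted for brevity:

```python
import torch
from torch import nn

c1, c2, k, s = 128, 256, 3, 2
scdown = nn.Sequential(
    nn.Conv2d(c1, c2, kernel_size=1, stride=1),                              # pointwise: 128 -> 256 channels
    nn.Conv2d(c2, c2, kernel_size=k, stride=s, padding=k // 2, groups=c2),   # depthwise, halves H and W
)
x = torch.randn(1, c1, 40, 40)
print(scdown(x).shape)   # torch.Size([1, 256, 20, 20])
```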
+ """ + + def __init__( + self, + c1: int, + c2: int, + c3: int, + c4: int, + num_blocks: int = 3, + bias: bool = False, + activation: Callable[..., nn.Module] | None = None, + normalization: Callable[..., nn.Module] | None = None, + ) -> None: + super().__init__() + self.c = c3 // 2 + + self.cv1 = Conv2dModule( + c1, + c3, + 1, + 1, + bias=bias, + activation=build_activation_layer(activation), + normalization=build_norm_layer(normalization, num_features=c3), + ) + + self.cv2 = nn.Sequential( + CSPRepLayer( + c3 // 2, + c4, + num_blocks, + 1, + bias=bias, + activation=activation, + normalization=normalization, + ), + Conv2dModule( + c4, + c4, + 3, + 1, + padding=auto_pad(kernel_size=3), + bias=bias, + activation=build_activation_layer(activation), + normalization=build_norm_layer(normalization, num_features=c4), + ), + ) + + self.cv3 = nn.Sequential( + CSPRepLayer( + c4, + c4, + num_blocks, + 1, + bias=bias, + activation=activation, + normalization=normalization, + ), + Conv2dModule( + c4, + c4, + 3, + 1, + padding=auto_pad(kernel_size=3), + bias=bias, + activation=build_activation_layer(activation), + normalization=build_norm_layer(normalization, num_features=c4), + ), + ) + + self.cv4 = Conv2dModule( + c3 + (2 * c4), + c2, + 1, + 1, + bias=bias, + activation=build_activation_layer(activation), + normalization=build_norm_layer(normalization, num_features=c2), + ) + + def forward(self, x: Tensor) -> Tensor: + """Forward pass.""" + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in [self.cv2, self.cv3]) + return self.cv4(torch.cat(y, 1)) + + +class HybridEncoderModule(nn.Module): + """HybridEncoder for DFine. + + TODO(Eugene): Merge with current rtdetr.HybridEncoderModule in next PR. + + Args: + in_channels (list[int], optional): List of input channels for each feature map. + Defaults to [512, 1024, 2048]. + feat_strides (list[int], optional): List of stride values for + each feature map. Defaults to [8, 16, 32]. + hidden_dim (int, optional): Hidden dimension size. Defaults to 256. + nhead (int, optional): Number of attention heads in the transformer encoder. + Defaults to 8. + dim_feedforward (int, optional): Dimension of the feedforward network + in the transformer encoder. Defaults to 1024. + dropout (float, optional): Dropout rate. Defaults to 0.0. + enc_activation (Callable[..., nn.Module]): Activation layer module. + Defaults to ``nn.GELU``. + normalization (Callable[..., nn.Module]): Normalization layer module. + Defaults to ``partial(build_norm_layer, nn.BatchNorm2d, layer_name="norm")``. + use_encoder_idx (list[int], optional): List of indices of the encoder to use. + Defaults to [2]. + num_encoder_layers (int, optional): Number of layers in the transformer encoder. + Defaults to 1. + pe_temperature (float, optional): Temperature parameter for positional encoding. + Defaults to 10000. + expansion (float, optional): Expansion factor for the CSPRepLayer. + Defaults to 1.0. + depth_mult (float, optional): Depth multiplier for the CSPRepLayer. + Defaults to 1.0. + activation (Callable[..., nn.Module]): Activation layer module. + Defaults to ``nn.SiLU``. + eval_spatial_size (tuple[int, int] | None, optional): Spatial size for + evaluation. Defaults to None. 
+ """ + + def __init__( + self, + in_channels: list[int] = [512, 1024, 2048], # noqa: B006 + feat_strides: list[int] = [8, 16, 32], # noqa: B006 + hidden_dim: int = 256, + nhead: int = 8, + dim_feedforward: int = 1024, + dropout: float = 0.0, + enc_activation: Callable[..., nn.Module] = nn.GELU, + normalization: Callable[..., nn.Module] = partial(build_norm_layer, nn.BatchNorm2d, layer_name="norm"), + use_encoder_idx: list[int] = [2], # noqa: B006 + num_encoder_layers: int = 1, + pe_temperature: int = 10000, + expansion: float = 1.0, + depth_mult: float = 1.0, + activation: Callable[..., nn.Module] = nn.SiLU, + eval_spatial_size: tuple[int, int] | None = None, + ): + super().__init__() + self.in_channels = in_channels + self.feat_strides = feat_strides + self.hidden_dim = hidden_dim + self.use_encoder_idx = use_encoder_idx + self.num_encoder_layers = num_encoder_layers + self.pe_temperature = pe_temperature + self.eval_spatial_size = eval_spatial_size + self.out_channels = [hidden_dim for _ in range(len(in_channels))] + self.out_strides = feat_strides + + # channel projection + self.input_proj = nn.ModuleList() + for in_channel in in_channels: + self.input_proj.append( + nn.Sequential( + OrderedDict( + [ + ("conv", nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False)), + ("norm", nn.BatchNorm2d(hidden_dim)), + ], + ), + ), + ) + + # encoder transformer + encoder_layer = TransformerEncoderLayer( + hidden_dim, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=enc_activation, + ) + + self.encoder = nn.ModuleList( + [TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) for _ in range(len(use_encoder_idx))], + ) + + # top-down fpn + self.lateral_convs = nn.ModuleList() + self.fpn_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append( + Conv2dModule( + hidden_dim, + hidden_dim, + 1, + 1, + normalization=build_norm_layer(normalization, num_features=hidden_dim), + activation=None, + ), + ) + self.fpn_blocks.append( + RepNCSPELAN4( + hidden_dim * 2, + hidden_dim, + hidden_dim * 2, + round(expansion * hidden_dim // 2), + round(3 * depth_mult), + activation=activation, + normalization=normalization, + ), + ) + + # bottom-up pan + self.downsample_convs = nn.ModuleList() + self.pan_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1): + self.downsample_convs.append( + nn.Sequential( + SCDown( + hidden_dim, + hidden_dim, + 3, + 2, + normalization=normalization, + ), + ), + ) + self.pan_blocks.append( + RepNCSPELAN4( + hidden_dim * 2, + hidden_dim, + hidden_dim * 2, + round(expansion * hidden_dim // 2), + round(3 * depth_mult), + activation=activation, + normalization=normalization, + ), + ) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + """Reset parameters.""" + if self.eval_spatial_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] + pos_embed = self.build_2d_sincos_position_embedding( + self.eval_spatial_size[1] // stride, + self.eval_spatial_size[0] // stride, + self.hidden_dim, + self.pe_temperature, + ) + setattr(self, f"pos_embed{idx}", pos_embed) + + @staticmethod + def build_2d_sincos_position_embedding( + w: int, + h: int, + embed_dim: int = 256, + temperature: float = 10000.0, + ) -> Tensor: + """Build 2D sin-cos position embedding.""" + grid_w = torch.arange(int(w), dtype=torch.float32) + grid_h = torch.arange(int(h), dtype=torch.float32) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") + if embed_dim % 4 != 0: + msg 
= "Embed dimension must be divisible by 4 for 2D sin-cos position embedding" + raise ValueError(msg) + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = 1.0 / (temperature**omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :] + + def forward(self, feats: Tensor) -> list[Tensor]: + """Forward pass.""" + if len(feats) != len(self.in_channels): + msg = f"Input feature size {len(feats)} does not match the number of input channels {len(self.in_channels)}" + raise ValueError(msg) + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + + # encoder + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1) + if self.training or self.eval_spatial_size is None: + pos_embed = self.build_2d_sincos_position_embedding(w, h, self.hidden_dim, self.pe_temperature).to( + src_flatten.device, + ) + else: + pos_embed = getattr(self, f"pos_embed{enc_ind}").to(src_flatten.device) + + memory = self.encoder[i](src_flatten, pos_embed=pos_embed) + proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() + + # broadcasting and fusion + inner_outs = [proj_feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = proj_feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh) + inner_outs[0] = feat_heigh + upsample_feat = f.interpolate(feat_heigh, scale_factor=2.0, mode="nearest") + inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](torch.concat([upsample_feat, feat_low], dim=1)) + inner_outs.insert(0, inner_out) + + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_height], dim=1)) + outs.append(out) + + return outs + + +class HybridEncoder: + """HybridEncoder factory for D-Fine detection.""" + + encoder_cfg: ClassVar[dict[str, Any]] = { + "dfine_hgnetv2_n": { + "in_channels": [512, 1024], + "feat_strides": [16, 32], + "hidden_dim": 128, + "use_encoder_idx": [1], + "dim_feedforward": 512, + "expansion": 0.34, + "depth_mult": 0.5, + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_s": { + "in_channels": [256, 512, 1024], + "hidden_dim": 256, + "expansion": 0.5, + "depth_mult": 0.34, + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_m": { + "in_channels": [384, 768, 1536], + "hidden_dim": 256, + "depth_mult": 0.67, + "eval_spatial_size": [640, 640], + }, + "dfine_hgnetv2_l": {}, + "dfine_hgnetv2_x": { + "hidden_dim": 384, + "dim_feedforward": 2048, + }, + } + + def __new__(cls, model_name: str) -> HybridEncoderModule: + """Constructor for HybridEncoder.""" + if model_name not in cls.encoder_cfg: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + return HybridEncoderModule(**cls.encoder_cfg[model_name]) diff --git a/src/otx/algo/detection/utils/utils.py b/src/otx/algo/detection/utils/utils.py index 143ea1571cd..78c73f7eace 100644 --- a/src/otx/algo/detection/utils/utils.py +++ b/src/otx/algo/detection/utils/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel 
Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. """Utils for otx detection algo. @@ -6,6 +6,7 @@ Reference : - https://github.com/open-mmlab/mmdetection/blob/v3.2.0/mmdet/models/utils. - https://github.com/open-mmlab/mmdeploy/blob/v1.3.1/mmdeploy/codebase/mmdet/structures/bbox/transforms. + - https://github.com/Peterande/D-FINE/blob/master/src/zoo/dfine/dfine_utils.py """ from __future__ import annotations @@ -16,6 +17,7 @@ from einops import rearrange from torch import Tensor, nn from torch.autograd import Function +from torchvision.ops import box_convert from otx.algo.utils.mmengine_utils import InstanceData from otx.core.data.entity.detection import DetBatchDataEntity @@ -334,3 +336,164 @@ def set_info_into_instance(layer_dict: dict[str, Any]) -> nn.Module: for k, v in layer_dict.items(): setattr(layer, k, v) return layer + + +def dfine_weighting_function(reg_max: int, up: Tensor, reg_scale: Tensor) -> Tensor: + """Generates the non-uniform Weighting Function W(n) for bounding box regression. + + Args: + reg_max (int): Max number of the discrete bins. + up (Tensor): Controls upper bounds of the sequence, where maximum offset is ±up * H / W. + reg_scale (Tensor): Controls the curvature of the Weighting Function. + Larger values result in flatter weights near the central axis W(reg_max/2)=0 + and steeper weights at both ends. + deploy (bool): If True, uses deployment mode settings. + + Returns: + Tensor: Sequence of Weighting Function. + """ + upper_bound1 = abs(up[0]) * abs(reg_scale) + upper_bound2 = abs(up[0]) * abs(reg_scale) * 2 + step = (upper_bound1 + 1) ** (2 / (reg_max - 2)) + left_values = [-((step) ** i) + 1 for i in range(reg_max // 2 - 1, 0, -1)] + right_values = [(step) ** i - 1 for i in range(1, reg_max // 2)] + return torch.cat( + [ + -upper_bound2, + torch.cat(left_values), + torch.zeros_like(up[0][None]), + torch.cat(right_values), + upper_bound2, + ], + 0, + ) + + +def dfine_distance2bbox(points: Tensor, distance: Tensor, reg_scale: Tensor) -> Tensor: + """Decodes edge-distances into bounding box coordinates. + + Args: + points (Tensor): (B, N, 4) or (N, 4) format, representing [x, y, w, h], + where (x, y) is the center and (w, h) are width and height. + distance (Tensor): (B, N, 4) or (N, 4), representing distances from the + point to the left, top, right, and bottom boundaries. + + reg_scale (float): Controls the curvature of the Weighting Function. + + Returns: + Tensor: Bounding boxes in (N, 4) or (B, N, 4) format [cx, cy, w, h]. + """ + reg_scale = abs(reg_scale) + x1 = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale) + y1 = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale) + x2 = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale) + y2 = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale) + + bboxes = torch.stack([x1, y1, x2, y2], -1) + return box_convert(bboxes, in_fmt="xyxy", out_fmt="cxcywh") + + +def dfine_bbox2distance( + points: Tensor, + bbox: Tensor, + reg_max: int, + reg_scale: Tensor, + up: Tensor, + eps: float = 0.1, +) -> tuple[Tensor, Tensor, Tensor]: + """Converts bounding box coordinates to distances from a reference point. + + Refer to D-Fine: https://github.com/Peterande/D-FINE. + + Args: + points (Tensor): (n, 4) [x, y, w, h], where (x, y) is the center. + bbox (Tensor): (n, 4) bounding boxes in "xyxy" format. 
+ reg_max (float): Maximum bin value. + reg_scale (float): Controling curvarture of W(n). + up (Tensor): Controling upper bounds of W(n). + eps (float): Small value to ensure target < reg_max. + + Returns: + Tuple[Tensor, Tensor, Tensor]: + - indices (Tensor): Index of the left bin closest to each GT value, shape (N, ). + - weight_right (Tensor): Weight assigned to the right bin, shape (N, ). + - weight_left (Tensor): Weight assigned to the left bin, shape (N, ). + """ + + def _translate_gt(gt: Tensor, reg_max: int, reg_scale: Tensor, up: Tensor) -> tuple[Tensor, Tensor, Tensor]: + """Decodes bounding box ground truth (GT) values into distribution-based GT representations. + + This function maps continuous GT values into discrete distribution bins, which can be used + for regression tasks in object detection models. + + It calculates the indices of the closest bins to each GT value and assigns interpolation weights + to these bins based on their proximity to the GT value. + + In the paper: + 'a' (up) controlling the upper bounds. + 'c' (reg_scale) controlling the curvature. + + Args: + gt (Tensor): Ground truth bounding box values, shape (N, ). + reg_max (int): Maximum number of discrete bins for the distribution. + reg_scale (Tensor): Controls the curvature of the Weighting Function. + up (Tensor): Controls the upper bounds of the Weighting Function. + + Returns: + Tuple[Tensor, Tensor, Tensor]: + - indices (Tensor): Index of the left bin closest to each GT value, shape (N, ). + - weight_right (Tensor): Weight assigned to the right bin, shape (N, ). + - weight_left (Tensor): Weight assigned to the left bin, shape (N, ). + """ + gt = gt.reshape(-1) + function_values = dfine_weighting_function(reg_max, up, reg_scale) + + # Find the closest left-side indices for each value + diffs = function_values.unsqueeze(0) - gt.unsqueeze(1) + mask = diffs <= 0 + closest_left_indices = torch.sum(mask, dim=1) - 1 + + # Calculate the weights for the interpolation + indices = closest_left_indices.float() + + weight_right = torch.zeros_like(indices) + weight_left = torch.zeros_like(indices) + + valid_idx_mask = (indices >= 0) & (indices < reg_max) + valid_indices = indices[valid_idx_mask].long() + + # Obtain distances + left_values = function_values[valid_indices] + right_values = function_values[valid_indices + 1] + + left_diffs = torch.abs(gt[valid_idx_mask] - left_values) + right_diffs = torch.abs(right_values - gt[valid_idx_mask]) + + # Valid weights + weight_right[valid_idx_mask] = left_diffs / (left_diffs + right_diffs) + weight_left[valid_idx_mask] = 1.0 - weight_right[valid_idx_mask] + + # Invalid weights (out of range) + invalid_idx_mask_neg = indices < 0 + weight_right[invalid_idx_mask_neg] = 0.0 + weight_left[invalid_idx_mask_neg] = 1.0 + indices[invalid_idx_mask_neg] = 0.0 + + invalid_idx_mask_pos = indices >= reg_max + weight_right[invalid_idx_mask_pos] = 1.0 + weight_left[invalid_idx_mask_pos] = 0.0 + indices[invalid_idx_mask_pos] = reg_max - 0.1 + + return indices, weight_right, weight_left + + reg_scale = abs(reg_scale) + # ϕ = (dᴳᵀ- d⁰) / {H, H, W, W} + left = (points[:, 0] - bbox[:, 0]) / (points[..., 2] / reg_scale + 1e-16) - 0.5 * reg_scale + top = (points[:, 1] - bbox[:, 1]) / (points[..., 3] / reg_scale + 1e-16) - 0.5 * reg_scale + right = (bbox[:, 2] - points[:, 0]) / (points[..., 2] / reg_scale + 1e-16) - 0.5 * reg_scale + bottom = (bbox[:, 3] - points[:, 1]) / (points[..., 3] / reg_scale + 1e-16) - 0.5 * reg_scale + four_lens = torch.stack([left, top, right, bottom], -1) + four_lens, 
weight_right, weight_left = _translate_gt(four_lens, reg_max, reg_scale, up) + if reg_max is not None: + four_lens = four_lens.clamp(min=0, max=reg_max - eps) + return four_lens.reshape(-1).detach(), weight_right.detach(), weight_left.detach() diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index 8fdf26736a7..ca51bb71725 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023-2024 Intel Corporation +# Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 """Helper to support TorchVision data transform functions.""" @@ -3921,3 +3921,44 @@ def _dispatch_transform(cls, cfg_transform: DictConfig | dict | tvt_v2.Transform raise TypeError(msg) return transform + + +class RandomIoUCrop(tvt_v2.RandomIoUCrop): + """Random IoU crop with the option to set probability. + + Args: + min_scale (float, optional): the same as RandomIoUCrop. Defaults to 0.3. + max_scale (float, optional): the same as RandomIoUCrop. Defaults to 1. + min_aspect_ratio (float, optional): the same as RandomIoUCrop. Defaults to 0.5. + max_aspect_ratio (float, optional): the same as RandomIoUCrop. Defaults to 2. + sampler_options (list[float] | None, optional): the same as RandomIoUCrop. Defaults to None. + trials (int, optional): the same as RandomIoUCrop. Defaults to 40. + p (float, optional): probability. Defaults to 1.0. + """ + + def __init__( + self, + min_scale: float = 0.3, + max_scale: float = 1, + min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2, + sampler_options: list[float] | None = None, + trials: int = 40, + p: float = 1.0, + ): + super().__init__( + min_scale, + max_scale, + min_aspect_ratio, + max_aspect_ratio, + sampler_options, + trials, + ) + self.p = p + + def __call__(self, *inputs: Any) -> Any: # noqa: ANN401 + """Apply the transform to the given inputs.""" + if torch.rand(1) >= self.p: + return inputs if len(inputs) > 1 else inputs[0] + + return super().forward(*inputs) diff --git a/src/otx/recipe/detection/dfine_x.yaml b/src/otx/recipe/detection/dfine_x.yaml new file mode 100644 index 00000000000..4be4342e94c --- /dev/null +++ b/src/otx/recipe/detection/dfine_x.yaml @@ -0,0 +1,129 @@ +model: + class_path: otx.algo.detection.d_fine.DFine + init_args: + model_name: dfine_hgnetv2_x + label_info: 80 + multi_scale: true + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.00025 + betas: [0.9, 0.999] + weight_decay: 0.000125 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 100 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 6 + monitor: val/map_50 +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/torchvision_base.yaml +overrides: + callbacks: + - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling + init_args: + max_interval: 1 + min_lrschedule_patience: 3 + - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup + init_args: + warmup_iters: 100 + warmup_epochs: 7 + + data: + input_size: + - 640 + - 640 + task: DETECTION + stack_images: true + data_format: coco_instances + train_subset: + batch_size: 8 + num_workers: 4 + to_tv_image: true + transforms: + - class_path: torchvision.transforms.v2.RandomPhotometricDistort + init_args: + p: 0.5 + - class_path: 
torchvision.transforms.v2.RandomZoomOut + init_args: + fill: 0 + - class_path: otx.core.data.transform_libs.torchvision.RandomIoUCrop + init_args: + p: 0.8 + - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes + init_args: + min_size: 1 + - class_path: otx.core.data.transform_libs.torchvision.RandomFlip + init_args: + prob: 0.5 + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + transform_bbox: true + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion + enable: false + init_args: + is_numpy_to_tvtensor: true + - class_path: otx.core.data.transform_libs.torchvision.RandomAffine + enable: false + init_args: + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.GaussianBlur + enable: false + init_args: + kernel_size: 5 + - class_path: torchvision.transforms.v2.RandomVerticalFlip + enable: false + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true + - class_path: torchvision.transforms.v2.GaussianNoise + enable: false + - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes + init_args: + min_size: 1 + sampler: + class_path: otx.algo.samplers.balanced_sampler.BalancedSampler + + val_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true + + test_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true diff --git a/src/otx/recipe/detection/dfine_x_tile.yaml b/src/otx/recipe/detection/dfine_x_tile.yaml new file mode 100644 index 00000000000..74523c361de --- /dev/null +++ b/src/otx/recipe/detection/dfine_x_tile.yaml @@ -0,0 +1,125 @@ +model: + class_path: otx.algo.detection.d_fine.DFine + init_args: + model_name: dfine_hgnetv2_x + label_info: 80 + multi_scale: false + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.00025 + betas: [0.9, 0.999] + weight_decay: 0.000125 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 20 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 9 + monitor: val/map_50 + min_lr: 2e-06 +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/detection_tile.yaml +overrides: + reset: + - data.train_subset.transforms + - data.val_subset.transforms + - data.test_subset.transforms + + data: + input_size: + - 640 + - 640 + task: DETECTION + stack_images: true + data_format: coco_instances + train_subset: + batch_size: 8 + num_workers: 4 + to_tv_image: true + transforms: + - class_path: torchvision.transforms.v2.RandomPhotometricDistort + init_args: + p: 0.5 + - class_path: torchvision.transforms.v2.RandomZoomOut + init_args: + fill: 0 + - class_path: otx.core.data.transform_libs.torchvision.RandomIoUCrop + init_args: + p: 0.8 + - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes + init_args: + min_size: 1 + - class_path: 
otx.core.data.transform_libs.torchvision.RandomFlip + init_args: + prob: 0.5 + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + transform_bbox: true + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion + enable: false + init_args: + is_numpy_to_tvtensor: true + - class_path: otx.core.data.transform_libs.torchvision.RandomAffine + enable: false + init_args: + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.GaussianBlur + enable: false + init_args: + kernel_size: 5 + - class_path: torchvision.transforms.v2.RandomVerticalFlip + enable: false + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true + - class_path: torchvision.transforms.v2.GaussianNoise + enable: false + - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes + init_args: + min_size: 1 + sampler: + class_path: otx.algo.samplers.balanced_sampler.BalancedSampler + + val_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true + + test_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: true diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index 1e670930cf0..fdbb7c73f00 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -97,6 +97,10 @@ "task": OTXTaskType.DETECTION, "model_name": "rtmdet_tiny", }, + "Object_Detection_DFine_X": { + "task": OTXTaskType.DETECTION, + "model_name": "dfine_x", + }, # INSTANCE_SEGMENTATION "Custom_Counting_Instance_Segmentation_MaskRCNN_ResNet50": { "task": OTXTaskType.INSTANCE_SEGMENTATION, diff --git a/src/otx/tools/templates/detection/detection/dfine_x/template.yaml b/src/otx/tools/templates/detection/detection/dfine_x/template.yaml new file mode 100644 index 00000000000..459395bdd7f --- /dev/null +++ b/src/otx/tools/templates/detection/detection/dfine_x/template.yaml @@ -0,0 +1,46 @@ +# Description. +model_template_id: Object_Detection_DFine_X +name: DFine-X +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for DFine-X +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.00025 + auto_hpo_state: POSSIBLE + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 202.486 +size: 240.0 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. 
+# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/tests/integration/api/test_xai.py b/tests/integration/api/test_xai.py index 8a565233f4a..5408cd5049e 100644 --- a/tests/integration/api/test_xai.py +++ b/tests/integration/api/test_xai.py @@ -51,6 +51,11 @@ def test_forward_explain( # TODO(Eugene): maskdino not support yet. pytest.skip(f"There's issue with inst-seg: {model_name}. Skip for now.") + if "dfine" in model_name: + # TODO(Eugene): dfine not support yet. + # https://jira.devtools.intel.com/browse/CVS-160781 + pytest.skip(f"There's issue with dfine: {model_name}. Skip for now.") + if "dino" in model_name: pytest.skip("DINO is not supported.") @@ -118,6 +123,11 @@ def test_predict_with_explain( # TODO(Eugene): maskdino not support yet. pytest.skip(f"There's issue with inst-seg: {model_name}. Skip for now.") + if "dfine" in model_name: + # TODO(Eugene): dfine not support yet. + # https://jira.devtools.intel.com/browse/CVS-160781 + pytest.skip(f"There's issue with dfine: {model_name}. Skip for now.") + if "rtmdet_tiny" in recipe: # TODO (sungchul): enable xai for rtmdet_tiny (CVS-142651) pytest.skip("rtmdet_tiny on detection is not supported yet.") diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index 649bbf6a4c1..0de2a490929 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -252,6 +252,9 @@ def test_otx_e2e( if "dino" in model_name: return # DINO is not supported. + if "dfine" in model_name: + return # DFine is not supported. + if "rtdetr" in model_name: return # RT-DETR currently is not supported. @@ -331,6 +334,9 @@ def test_otx_explain_e2e( if "dino" in model_name: pytest.skip("DINO is not supported.") + if "dfine" in model_name: + pytest.skip("DFine is not supported.") + if "maskrcnn_r50_tv" in model_name: pytest.skip("MaskRCNN R50 Torchvision model doesn't support explain.") elif "rtdetr" in recipe: diff --git a/tests/unit/algo/detection/test_dfine.py b/tests/unit/algo/detection/test_dfine.py new file mode 100644 index 00000000000..2d025849842 --- /dev/null +++ b/tests/unit/algo/detection/test_dfine.py @@ -0,0 +1,158 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of D-Fine.""" + +from unittest.mock import MagicMock + +import pytest +import torch +import torchvision +from otx.algo.detection.backbones.hgnetv2 import HGNetv2 +from otx.algo.detection.d_fine import DFine +from otx.algo.detection.heads.dfine_decoder import DFINETransformer +from otx.algo.detection.losses.dfine_loss import DFINECriterion +from otx.algo.detection.necks.dfine_hybrid_encoder import HybridEncoder +from otx.algo.detection.rtdetr import DETR +from otx.core.data.entity.detection import DetBatchPredEntity + + +class TestDFine: + @pytest.mark.parametrize( + "model", + [ + DFine(label_info=3, model_name="dfine_hgnetv2_x"), + ], + ) + def test_loss(self, model, fxt_data_module): + data = next(iter(fxt_data_module.train_dataloader())) + data.images = torch.randn([2, 3, 640, 640]) + model(data) + + @pytest.mark.parametrize( + "model", + [ + DFine(label_info=3, model_name="dfine_hgnetv2_x"), + ], + ) + def test_predict(self, model, fxt_data_module): + data = next(iter(fxt_data_module.train_dataloader())) + data.images = torch.randn(2, 3, 640, 640) + model.eval() + output = model(data) + assert isinstance(output, DetBatchPredEntity) + + @pytest.mark.parametrize( + "model", + [ + DFine(label_info=3, model_name="dfine_hgnetv2_x"), + ], + ) + def test_export(self, model): + model.eval() + 
output = model.forward_for_tracing(torch.randn(1, 3, 640, 640)) + assert len(output) == 3 + + @pytest.fixture() + def dfine_model(self): + num_classes = 10 + model_name = "dfine_hgnetv2_x" + backbone = HGNetv2(model_name=model_name) + encoder = HybridEncoder(model_name=model_name) + decoder = DFINETransformer( + model_name=model_name, + num_classes=num_classes, + ) + criterion = DFINECriterion( + weight_dict={ + "loss_vfl": 1, + "loss_bbox": 5, + "loss_giou": 2, + "loss_fgl": 0.15, + "loss_ddf": 1.5, + }, + alpha=0.75, + gamma=2.0, + reg_max=32, + num_classes=num_classes, + ) + return DETR(backbone=backbone, encoder=encoder, decoder=decoder, num_classes=10, criterion=criterion) + + @pytest.fixture() + def targets(self): + return [ + { + "boxes": torch.tensor([[0.2739, 0.2848, 0.3239, 0.3348], [0.1652, 0.1109, 0.2152, 0.1609]]), + "labels": torch.tensor([2, 2]), + }, + { + "boxes": torch.tensor( + [ + [0.6761, 0.8174, 0.7261, 0.8674], + [0.1652, 0.1109, 0.2152, 0.1609], + [0.2848, 0.9370, 0.3348, 0.9870], + ], + ), + "labels": torch.tensor([8, 2, 7]), + }, + ] + + @pytest.fixture() + def images(self): + return torch.randn(2, 3, 640, 640) + + def test_dfine_forward(self, dfine_model, images, targets): + dfine_model.train() + output = dfine_model(images, targets) + assert isinstance(output, dict) + for key in output: + assert key.startswith("loss_") + assert "loss_bbox" in output + assert "loss_vfl" in output + assert "loss_giou" in output + + def test_dfine_postprocess(self, dfine_model): + outputs = { + "pred_logits": torch.randn(2, 100, 10), + "pred_boxes": torch.randn(2, 100, 4), + } + original_sizes = [[640, 640], [640, 640]] + result = dfine_model.postprocess(outputs, original_sizes) + assert isinstance(result, tuple) + assert len(result) == 3 + scores, boxes, labels = result + assert isinstance(scores, list) + assert isinstance(boxes, list) + assert isinstance(boxes[0], torchvision.tv_tensors.BoundingBoxes) + assert boxes[0].canvas_size == original_sizes[0] + assert isinstance(labels, list) + assert len(scores) == 2 + assert len(boxes) == 2 + assert len(labels) == 2 + + def test_dfine_export(self, dfine_model, images): + dfine_model.eval() + dfine_model.num_top_queries = 10 + batch_img_metas = [{"img_shape": (740, 740), "scale_factor": 1.0}] + result = dfine_model.export(images, batch_img_metas) + assert isinstance(result, dict) + assert "bboxes" in result + assert "labels" in result + assert "scores" in result + assert result["bboxes"].shape == (2, 10, 4) + # ensure no scaling + assert torch.all(result["bboxes"] < 2) + + def test_set_input_size(self): + input_size = 1280 + model = DETR( + backbone=MagicMock(), + encoder=MagicMock(), + decoder=MagicMock(), + num_classes=10, + input_size=input_size, + ) + + expected_multi_scale = sorted([input_size - i * 32 for i in range(-5, 6)] + [input_size] * 2) + + assert sorted(model.multi_scale) == expected_multi_scale diff --git a/tests/unit/core/data/test_tiling.py b/tests/unit/core/data/test_tiling.py index ba232509dd7..b5c791f70b2 100644 --- a/tests/unit/core/data/test_tiling.py +++ b/tests/unit/core/data/test_tiling.py @@ -124,6 +124,7 @@ def fxt_data_config(self, fxt_data_roots) -> dict[dict]: }, } + @pytest.mark.intense() def det_dummy_forward(self, x: DetBatchDataEntity) -> DetBatchPredEntity: """Dummy detection forward function for testing. 
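The `intense` marker added to these tile tests is what the new tox environments key off: the default unit-test run deselects them with `pytest -m "not intense"`, while the new `intense-unit-test-{py310, py311}` environment selects them with `pytest -m "intense"` (see the tox.ini hunk at the end of this patch). Below is a minimal sketch of how such a marker is typically registered so pytest does not flag it as unknown; whether OTX declares it in a conftest.py hook like this or in its pytest configuration file is an assumption here, not something shown in this diff.

    # conftest.py -- illustrative sketch only; the real OTX marker registration
    # may live in pyproject.toml under [tool.pytest.ini_options] markers instead.
    def pytest_configure(config):
        # Register the custom mark so `-m "intense"` / `-m "not intense"`
        # can select or deselect the resource-heavy tile tests.
        config.addinivalue_line(
            "markers",
            "intense: resource-heavy tests excluded from the default unit-test run",
        )

With the marker registered, `pytest -m "not intense" tests/unit` keeps the default coverage run lightweight, and the tile-merge tests marked in this file only execute in the dedicated intense environment.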
@@ -178,6 +179,7 @@ def det_dummy_forward(self, x: DetBatchDataEntity) -> DetBatchPredEntity: return pred_entity + @pytest.mark.intense() def inst_seg_dummy_forward(self, x: InstanceSegBatchDataEntity) -> InstanceSegBatchPredEntity: """Dummy instance segmantation forward function for testing. @@ -240,6 +242,7 @@ def inst_seg_dummy_forward(self, x: InstanceSegBatchDataEntity) -> InstanceSegBa return pred_entity + @pytest.mark.intense() @pytest.mark.parametrize( "task", [OTXTaskType.DETECTION, OTXTaskType.INSTANCE_SEGMENTATION, OTXTaskType.SEMANTIC_SEGMENTATION], @@ -381,6 +384,7 @@ def test_tile_sampler(self, fxt_data_config): assert sampled_count == count, "Sampled count should be equal to the count of the dataloader batch size" + @pytest.mark.intense() def test_train_dataloader(self, fxt_data_config) -> None: for task, data_config in fxt_data_config.items(): # Enable tile adapter @@ -400,6 +404,7 @@ def test_train_dataloader(self, fxt_data_config) -> None: else: pytest.skip("Task not supported") + @pytest.mark.intense() def test_val_dataloader(self, fxt_data_config) -> None: for task, data_config in fxt_data_config.items(): # Enable tile adapter @@ -419,6 +424,7 @@ def test_val_dataloader(self, fxt_data_config) -> None: else: pytest.skip("Task not supported") + @pytest.mark.intense() def test_det_tile_merge(self, fxt_data_config): data_config = fxt_data_config[OTXTaskType.DETECTION] model = ATSS( @@ -427,6 +433,8 @@ def test_det_tile_merge(self, fxt_data_config): ) # updated from OTXDetectionModel to avoid NotImplementedError in _build_model # Enable tile adapter data_config["tile_config"] = TileConfig(enable_tiler=True) + data_config["mem_cache_size"] = "0" + data_config["val_subset"].batch_size = 1 tile_datamodule = OTXDataModule( task=OTXTaskType.DETECTION, **data_config, @@ -439,6 +447,7 @@ def test_det_tile_merge(self, fxt_data_config): for batch in tile_datamodule.val_dataloader(): model.forward_tiles(batch) + @pytest.mark.intense() def test_explain_det_tile_merge(self, fxt_data_config): data_config = fxt_data_config[OTXTaskType.DETECTION] model = ATSS( @@ -447,6 +456,8 @@ def test_explain_det_tile_merge(self, fxt_data_config): ) # updated from OTXDetectionModel to avoid NotImplementedError in _build_model # Enable tile adapter data_config["tile_config"] = TileConfig(enable_tiler=True, enable_adaptive_tiling=False) + data_config["mem_cache_size"] = "0" + data_config["val_subset"].batch_size = 1 tile_datamodule = OTXDataModule( task=OTXTaskType.DETECTION, **data_config, @@ -461,6 +472,7 @@ def test_explain_det_tile_merge(self, fxt_data_config): assert prediction.saliency_map[0].ndim == 3 self.explain_mode = False + @pytest.mark.intense() def test_instseg_tile_merge(self, fxt_data_config): data_config = fxt_data_config[OTXTaskType.INSTANCE_SEGMENTATION] model = MaskRCNN(label_info=3, model_name="maskrcnn_efficientnet_b2b", input_size=(256, 256)) @@ -480,6 +492,7 @@ def test_instseg_tile_merge(self, fxt_data_config): for batch in tile_datamodule.val_dataloader(): model.forward_tiles(batch) + @pytest.mark.intense() def test_explain_instseg_tile_merge(self, fxt_data_config): data_config = fxt_data_config[OTXTaskType.INSTANCE_SEGMENTATION] model = MaskRCNN(label_info=3, model_name="maskrcnn_efficientnet_b2b", input_size=(256, 256)) @@ -501,11 +514,14 @@ def test_explain_instseg_tile_merge(self, fxt_data_config): assert prediction.saliency_map[0].ndim == 3 self.explain_mode = False + @pytest.mark.intense() def test_seg_tile_merge(self, fxt_data_config): data_config = 
fxt_data_config[OTXTaskType.SEMANTIC_SEGMENTATION] model = LiteHRNet(label_info=3, model_name="lite_hrnet_18") # Enable tile adapter data_config["tile_config"] = TileConfig(enable_tiler=True) + data_config["mem_cache_size"] = "0" + data_config["val_subset"].batch_size = 1 tile_datamodule = OTXDataModule( task=OTXTaskType.SEMANTIC_SEGMENTATION, **data_config, @@ -517,6 +533,7 @@ def test_seg_tile_merge(self, fxt_data_config): for batch in tile_datamodule.val_dataloader(): model.forward_tiles(batch) + @pytest.mark.intense() def test_seg_tiler(self, mocker): rng = np.random.default_rng() rnd_tile_size = rng.integers(low=100, high=500) diff --git a/tox.ini b/tox.ini index a4d8d7ac0db..e8b1431fb37 100644 --- a/tox.ini +++ b/tox.ini @@ -48,13 +48,19 @@ deps = .[base,dev] commands = ; Run Unit-Test with coverage report. - pytest tests/unit \ + pytest -m "not intense" tests/unit \ --cov=otx \ --cov-report=xml:{toxworkdir}/coverage_{envname}.xml \ --cov-report=term-missing \ --cov-fail-under=0 \ {posargs} +[testenv:intense-unit-test-{py310, py311}] +deps = + .[base,dev] +commands = + pytest -m "intense" tests/unit {posargs} + [testenv:integration-test-{all, action, classification, multi_cls_classification, multi_label_classification, hlabel_classification, detection, rotated_detection, keypoint_detection, instance_segmentation, semantic_segmentation, visual_prompting_all, visual_prompting, zero_shot_visual_prompting, anomaly, anomaly_classification, anomaly_detection, anomaly_segmentation, object_detection_3d}] setenv = From e4b234fe64dca001db57dd39c3f19a839473a238 Mon Sep 17 00:00:00 2001 From: Alexander Barabanov <97449232+AlexanderBarabanov@users.noreply.github.com> Date: Fri, 17 Jan 2025 14:25:30 +0000 Subject: [PATCH 3/9] Bump download-artifact (#4179) bump download-artifact --- .github/workflows/perf_benchmark.yaml | 2 +- .github/workflows/publish.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/perf_benchmark.yaml b/.github/workflows/perf_benchmark.yaml index f5297c1a640..4de6b98e289 100644 --- a/.github/workflows/perf_benchmark.yaml +++ b/.github/workflows/perf_benchmark.yaml @@ -172,7 +172,7 @@ jobs: python -m pip install --require-hashes --no-deps -r /tmp/requirements.txt rm /tmp/requirements.txt - name: Download benchmark results - uses: actions/download-artifact@87c55149d96e628cc2ef7e6fc2aab372015aec85 # v4.1.3 + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: path: tests/perf/history/latest - name: Summarize benchamrk results diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 5adc23c3714..11dc49ebdb2 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -49,7 +49,7 @@ jobs: id-token: write steps: - name: Download artifacts - uses: actions/download-artifact@87c55149d96e628cc2ef7e6fc2aab372015aec85 # v4.1.3 + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: path: dist pattern: artifact-* From c3ce3618869f6fdaf78c34055e6fa7d9ff4c6f82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 09:32:30 +0100 Subject: [PATCH 4/9] Bump diffusers from 0.32.1 to 0.32.2 in the pip-others group (#4178) Bumps the pip-others group with 1 update: [diffusers](https://github.com/huggingface/diffusers). 
Updates `diffusers` from 0.32.1 to 0.32.2 - [Release notes](https://github.com/huggingface/diffusers/releases) - [Commits](https://github.com/huggingface/diffusers/compare/v0.32.1...v0.32.2) --- updated-dependencies: - dependency-name: diffusers dependency-type: direct:production update-type: version-update:semver-patch dependency-group: pip-others ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Vladislav Sovrasov --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fa4b942ff1b..48ce9f40767 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,7 +90,7 @@ base = [ transformers = [ "transformers==4.48.0", - "diffusers==0.32.1", + "diffusers==0.32.2", "torchmetrics[image]>=0.7.0" ] From 7ca826ad859dea910a8822ae15e2292e48257be6 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Mon, 20 Jan 2025 10:24:42 +0100 Subject: [PATCH 5/9] Update keypoint metric name in benchmark (#4180) --- tests/perf/test_keypoint_detection.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/perf/test_keypoint_detection.py b/tests/perf/test_keypoint_detection.py index 1ff150a03d6..8786f530445 100644 --- a/tests/perf/test_keypoint_detection.py +++ b/tests/perf/test_keypoint_detection.py @@ -47,10 +47,10 @@ class TestPerfKeypointDetection(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/PCK", summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), @@ -120,10 +120,10 @@ class TestPerfKeypointDetectionSingleObj(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/PCK", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/PCK", summary="max", compare=">", margin=0.1), 
Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), From 4416ac42019e6a7e596e4d02a2289b55fd8f0316 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 Jan 2025 16:40:46 +0100 Subject: [PATCH 6/9] Bump the pip-others group across 1 directory with 4 updates (#4185) Bumps the pip-others group with 4 updates in the / directory: [numba](https://github.com/numba/numba), [tox](https://github.com/tox-dev/tox), [pre-commit](https://github.com/pre-commit/pre-commit) and [transformers](https://github.com/huggingface/transformers). Updates `numba` from 0.60.0 to 0.61.0 - [Release notes](https://github.com/numba/numba/releases) - [Commits](https://github.com/numba/numba/compare/0.60.0...0.61.0) Updates `tox` from 4.23.2 to 4.24.1 - [Release notes](https://github.com/tox-dev/tox/releases) - [Changelog](https://github.com/tox-dev/tox/blob/main/docs/changelog.rst) - [Commits](https://github.com/tox-dev/tox/compare/4.23.2...4.24.1) Updates `pre-commit` from 4.0.1 to 4.1.0 - [Release notes](https://github.com/pre-commit/pre-commit/releases) - [Changelog](https://github.com/pre-commit/pre-commit/blob/main/CHANGELOG.md) - [Commits](https://github.com/pre-commit/pre-commit/compare/v4.0.1...v4.1.0) Updates `transformers` from 4.48.0 to 4.48.1 - [Release notes](https://github.com/huggingface/transformers/releases) - [Commits](https://github.com/huggingface/transformers/compare/v4.48.0...v4.48.1) --- updated-dependencies: - dependency-name: numba dependency-type: direct:production update-type: version-update:semver-minor dependency-group: pip-others - dependency-name: tox dependency-type: direct:production update-type: version-update:semver-minor dependency-group: pip-others - dependency-name: pre-commit dependency-type: direct:production update-type: version-update:semver-minor dependency-group: pip-others - dependency-name: transformers dependency-type: direct:production update-type: version-update:semver-patch dependency-group: pip-others ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 48ce9f40767..567dfbce811 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,15 +39,15 @@ dependencies = [ "einops==0.8.0", "decord==0.6.0", "typeguard>=4.3,<4.5", - "numba==0.60.0", + "numba==0.61.0", # TODO(ashwinvaidya17): https://github.com/openvinotoolkit/anomalib/issues/2126 "setuptools<70", ] [project.optional-dependencies] dev = [ - "tox==4.23.2", - "pre-commit==4.0.1", + "tox==4.24.1", + "pre-commit==4.1.0", "pylint", "pytest", "coverage", @@ -89,7 +89,7 @@ base = [ ] transformers = [ - "transformers==4.48.0", + "transformers==4.48.1", "diffusers==0.32.2", "torchmetrics[image]>=0.7.0" ] @@ -106,7 +106,7 @@ mmlab = [ ] ci_tox = [ - "tox==4.23.2", + "tox==4.24.1", ] ci_publish = [ From 2f0b54b4c01a13cfaf442270883445f9a1c111a9 Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Thu, 23 Jan 2025 08:36:14 +0000 Subject: [PATCH 7/9] DETR XAI (#4184) * Implement explainability features in DFine and RTDETR models --- CHANGELOG.md | 2 + docs/source/guide/tutorials/base/explain.rst | 4 +- src/otx/algo/detection/d_fine.py | 56 ++++++++++++++++++ .../detectors/detection_transformer.py | 40 ++++++++++--- src/otx/algo/detection/heads/dfine_decoder.py | 31 ++++++++-- .../algo/detection/heads/rtdetr_decoder.py | 35 ++++++++--- src/otx/algo/detection/rtdetr.py | 58 ++++++++++++++++++- tests/integration/api/test_xai.py | 18 +----- tests/integration/cli/test_cli.py | 13 +---- 9 files changed, 205 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a5dbbb26b6..e0578a1549b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ All notable changes to this project will be documented in this file. () - Add D-Fine Detection Algorithm () +- Add DETR XAI Explain Mode + () ### Enhancements diff --git a/docs/source/guide/tutorials/base/explain.rst b/docs/source/guide/tutorials/base/explain.rst index bf2af135783..cb195b9a914 100644 --- a/docs/source/guide/tutorials/base/explain.rst +++ b/docs/source/guide/tutorials/base/explain.rst @@ -32,6 +32,7 @@ which are heatmaps with red-colored areas indicating focus. Here's an example ho (otx) ...$ otx explain --work_dir otx-workspace \ --dump True # Wherether to save saliency map images or not + --explain_config.postprocess True # Resizes and applies colormap to the saliency map .. tab-item:: CLI (with config) @@ -41,6 +42,7 @@ which are heatmaps with red-colored areas indicating focus. Here's an example ho --data_root data/wgisd \ --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_033.ckpt \ --dump True # Wherether to save saliency map images or not + --explain_config.postprocess True # Resizes and applies colormap to the saliency map .. tab-item:: API @@ -49,7 +51,7 @@ which are heatmaps with red-colored areas indicating focus. 
Here's an example ho engine.explain( checkpoint="", datamodule=OTXDataModule(...), # The data module to use for predictions - explain_config=ExplainConfig(postprocess=True), + explain_config=ExplainConfig(postprocess=True), # Resizes and applies colormap to the saliency map dump=True # Wherether to save saliency map images or not ) diff --git a/src/otx/algo/detection/d_fine.py b/src/otx/algo/detection/d_fine.py index 5e16aa9c3c7..717ea9d6b23 100644 --- a/src/otx/algo/detection/d_fine.py +++ b/src/otx/algo/detection/d_fine.py @@ -157,6 +157,9 @@ def _customize_inputs( ) targets.append({"boxes": scaled_bboxes, "labels": ll}) + if self.explain_mode: + return {"entity": entity} + return { "images": entity.images, "targets": targets, @@ -185,6 +188,33 @@ def _customize_outputs( original_sizes = [img_info.ori_shape for img_info in inputs.imgs_info] scores, bboxes, labels = self.model.postprocess(outputs, original_sizes) + if self.explain_mode: + if not isinstance(outputs, dict): + msg = f"Model output should be a dict, but got {type(outputs)}." + raise ValueError(msg) + + if "feature_vector" not in outputs: + msg = "No feature vector in the model output." + raise ValueError(msg) + + if "saliency_map" not in outputs: + msg = "No saliency maps in the model output." + raise ValueError(msg) + + saliency_map = outputs["saliency_map"].detach().cpu().numpy() + feature_vector = outputs["feature_vector"].detach().cpu().numpy() + + return DetBatchPredEntity( + batch_size=len(outputs), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=scores, + bboxes=bboxes, + labels=labels, + feature_vector=feature_vector, + saliency_map=saliency_map, + ) + return DetBatchPredEntity( batch_size=len(outputs), images=inputs.images, @@ -306,3 +336,29 @@ def _optimization_config(self) -> dict[str, Any]: }, }, } + + @staticmethod + def _forward_explain_detection( + self, # noqa: ANN001 + entity: DetBatchDataEntity, + mode: str = "tensor", # noqa: ARG004 + ) -> dict[str, torch.Tensor]: + """Forward function for explainable detection model.""" + backbone_feats = self.encoder(self.backbone(entity.images)) + predictions = self.decoder(backbone_feats, explain_mode=True) + + raw_logits = DETR.split_and_reshape_logits( + backbone_feats, + predictions["raw_logits"], + ) + + saliency_map = self.explain_fn(raw_logits) + feature_vector = self.feature_vector_fn(backbone_feats) + predictions.update( + { + "feature_vector": feature_vector, + "saliency_map": saliency_map, + }, + ) + + return predictions diff --git a/src/otx/algo/detection/detectors/detection_transformer.py b/src/otx/algo/detection/detectors/detection_transformer.py index d6798f1d426..f3cda5b7417 100644 --- a/src/otx/algo/detection/detectors/detection_transformer.py +++ b/src/otx/algo/detection/detectors/detection_transformer.py @@ -1,11 +1,10 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """Base DETR model implementations.""" from __future__ import annotations -import warnings from typing import Any import numpy as np @@ -96,22 +95,47 @@ def export( explain_mode: bool = False, ) -> dict[str, Any] | tuple[list[Any], list[Any], list[Any]]: """Exports the model.""" + backbone_feats = self.encoder(self.backbone(batch_inputs)) + predictions = self.decoder(backbone_feats, explain_mode=True) results = self.postprocess( - self._forward_features(batch_inputs), + predictions, [meta["img_shape"] for meta in batch_img_metas], deploy_mode=True, ) - if explain_mode: - # TODO(Eugene): Implement 
explain mode for DETR model. - warnings.warn("Explain mode is not supported for DETR model. Return dummy values.", stacklevel=2) + raw_logits = self.split_and_reshape_logits(backbone_feats, predictions["raw_logits"]) + feature_vector = self.feature_vector_fn(backbone_feats) + saliency_map = self.explain_fn(raw_logits) xai_output = { - "feature_vector": torch.zeros(1, 1), - "saliency_map": torch.zeros(1), + "feature_vector": feature_vector, + "saliency_map": saliency_map, } results.update(xai_output) # type: ignore[union-attr] return results + @staticmethod + def split_and_reshape_logits( + backbone_feats: tuple[Tensor, ...], + raw_logits: Tensor, + ) -> tuple[Tensor, ...]: + """Splits and reshapes raw logits for explain mode. + + Args: + backbone_feats (tuple[Tensor,...]): Tuple of backbone features. + raw_logits (Tensor): Raw logits. + + Returns: + tuple[Tensor,...]: The reshaped logits. + """ + splits = [f.shape[-2] * f.shape[-1] for f in backbone_feats] + # Permute and split logits in one line + raw_logits = torch.split(raw_logits.permute(0, 2, 1), splits, dim=-1) + + # Reshape each split in a list comprehension + return tuple( + logits.reshape(f.shape[0], -1, f.shape[-2], f.shape[-1]) for logits, f in zip(raw_logits, backbone_feats) + ) + def postprocess( self, outputs: dict[str, Tensor], diff --git a/src/otx/algo/detection/heads/dfine_decoder.py b/src/otx/algo/detection/heads/dfine_decoder.py index d28e0cf3864..e2d8f9dd663 100644 --- a/src/otx/algo/detection/heads/dfine_decoder.py +++ b/src/otx/algo/detection/heads/dfine_decoder.py @@ -723,7 +723,7 @@ def _get_decoder_input( enc_topk_bbox_unact = torch.concat([denoising_bbox_unact, enc_topk_bbox_unact], dim=1) content = torch.concat([denoising_logits, content], dim=1) - return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list + return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list, enc_outputs_logits def _select_topk( self, @@ -762,8 +762,22 @@ def _select_topk( return topk_memory, topk_logits, topk_anchors - def forward(self, feats: Tensor, targets: list[dict[str, Tensor]] | None = None) -> dict[str, Tensor]: - """Forward pass of the DFine Transformer module.""" + def forward( + self, + feats: Tensor, + targets: list[dict[str, Tensor]] | None = None, + explain_mode: bool = False, + ) -> dict[str, Tensor]: + """Forward function of the D-FINE Decoder Transformer Module. + + Args: + feats (Tensor): Feature maps. + targets (list[dict[str, Tensor]] | None, optional): target annotations. Defaults to None. + explain_mode (bool, optional): Whether to return raw logits for explanation. Defaults to False. + + Returns: + dict[str, Tensor]: Output dictionary containing predicted logits, losses and boxes. 
+ """ # input projection and embedding memory, spatial_shapes = self._get_encoder_input(feats) @@ -781,7 +795,13 @@ def forward(self, feats: Tensor, targets: list[dict[str, Tensor]] | None = None) else: denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - init_ref_contents, init_ref_points_unact, enc_topk_bboxes_list, enc_topk_logits_list = self._get_decoder_input( + ( + init_ref_contents, + init_ref_points_unact, + enc_topk_bboxes_list, + enc_topk_logits_list, + raw_logits, + ) = self._get_decoder_input( memory, spatial_shapes, denoising_logits, @@ -858,6 +878,9 @@ def forward(self, feats: Tensor, targets: list[dict[str, Tensor]] | None = None) "pred_boxes": out_bboxes[-1], } + if explain_mode: + out["raw_logits"] = raw_logits + return out @torch.jit.unused diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py index dd5cf2f1991..bf140675ef7 100644 --- a/src/otx/algo/detection/heads/rtdetr_decoder.py +++ b/src/otx/algo/detection/heads/rtdetr_decoder.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """RTDETR decoder, modified from https://github.com/lyuwenyu/RT-DETR.""" @@ -546,10 +546,10 @@ def _get_decoder_input( output_memory = self.enc_output(memory) - enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_logits = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors - _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1) + _, topk_ind = torch.topk(enc_outputs_logits.max(-1).values, self.num_queries, dim=1) reference_points_unact = enc_outputs_coord_unact.gather( dim=1, @@ -560,9 +560,9 @@ def _get_decoder_input( if denoising_bbox_unact is not None: reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1) - enc_topk_logits = enc_outputs_class.gather( + enc_topk_logits = enc_outputs_logits.gather( dim=1, - index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1]), + index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_logits.shape[-1]), ) # extract region features @@ -575,10 +575,24 @@ def _get_decoder_input( if denoising_class is not None: target = torch.concat([denoising_class, target], 1) - return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits + return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits, enc_outputs_logits - def forward(self, feats: torch.Tensor, targets: list[dict[str, torch.Tensor]] | None = None) -> torch.Tensor: - """Forward pass of the RTDETRTransformer module.""" + def forward( + self, + feats: torch.Tensor, + targets: list[dict[str, torch.Tensor]] | None = None, + explain_mode: bool = False, + ) -> dict[str, torch.Tensor]: + """Forward function of RTDETRTransformer. + + Args: + feats (Tensor): Input features. + targets (List[Dict[str, Tensor]]): List of target dictionaries. + explain_mode (bool): Whether to return raw logits for explanation. + + Returns: + dict[str, Tensor]: Output dictionary containing predicted logits, losses and boxes. 
+ """ # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) @@ -596,7 +610,7 @@ def forward(self, feats: torch.Tensor, targets: list[dict[str, torch.Tensor]] | else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = self._get_decoder_input( + target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits, raw_logits = self._get_decoder_input( memory, spatial_shapes, denoising_class, @@ -630,6 +644,9 @@ def forward(self, feats: torch.Tensor, targets: list[dict[str, torch.Tensor]] | out["dn_aux_outputs"] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) out["dn_meta"] = dn_meta + if explain_mode: + out["raw_logits"] = raw_logits + return out @torch.jit.unused diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 87784dadd7a..fcbf6330c2e 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """RTDetr model implementations.""" @@ -128,6 +128,9 @@ def _customize_inputs( ) targets.append({"boxes": scaled_bboxes, "labels": ll}) + if self.explain_mode: + return {"entity": entity} + return { "images": entity.images, "targets": targets, @@ -156,6 +159,33 @@ def _customize_outputs( original_sizes = [img_info.ori_shape for img_info in inputs.imgs_info] scores, bboxes, labels = self.model.postprocess(outputs, original_sizes) + if self.explain_mode: + if not isinstance(outputs, dict): + msg = f"Model output should be a dict, but got {type(outputs)}." + raise ValueError(msg) + + if "feature_vector" not in outputs: + msg = "No feature vector in the model output." + raise ValueError(msg) + + if "saliency_map" not in outputs: + msg = "No saliency maps in the model output." 
+ raise ValueError(msg) + + saliency_map = outputs["saliency_map"].detach().cpu().numpy() + feature_vector = outputs["feature_vector"].detach().cpu().numpy() + + return DetBatchPredEntity( + batch_size=len(outputs), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=scores, + bboxes=bboxes, + labels=labels, + feature_vector=feature_vector, + saliency_map=saliency_map, + ) + return DetBatchPredEntity( batch_size=len(outputs), images=inputs.images, @@ -271,3 +301,29 @@ def _exporter(self) -> OTXModelExporter: def _optimization_config(self) -> dict[str, Any]: """PTQ config for RT-DETR.""" return {"model_type": "transformer"} + + @staticmethod + def _forward_explain_detection( + self, # noqa: ANN001 + entity: DetBatchDataEntity, + mode: str = "tensor", # noqa: ARG004 + ) -> dict[str, torch.Tensor]: + """Forward function for explainable detection model.""" + backbone_feats = self.encoder(self.backbone(entity.images)) + predictions = self.decoder(backbone_feats, explain_mode=True) + + raw_logits = DETR.split_and_reshape_logits( + backbone_feats, + predictions["raw_logits"], + ) + + saliency_map = self.explain_fn(raw_logits) + feature_vector = self.feature_vector_fn(backbone_feats) + predictions.update( + { + "feature_vector": feature_vector, + "saliency_map": saliency_map, + }, + ) + + return predictions diff --git a/tests/integration/api/test_xai.py b/tests/integration/api/test_xai.py index 5408cd5049e..d82723470ec 100644 --- a/tests/integration/api/test_xai.py +++ b/tests/integration/api/test_xai.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from pathlib import Path @@ -51,11 +51,6 @@ def test_forward_explain( # TODO(Eugene): maskdino not support yet. pytest.skip(f"There's issue with inst-seg: {model_name}. Skip for now.") - if "dfine" in model_name: - # TODO(Eugene): dfine not support yet. - # https://jira.devtools.intel.com/browse/CVS-160781 - pytest.skip(f"There's issue with dfine: {model_name}. Skip for now.") - if "dino" in model_name: pytest.skip("DINO is not supported.") @@ -63,9 +58,6 @@ def test_forward_explain( # TODO (sungchul): enable xai for rtmdet_tiny (CVS-142651) pytest.skip("rtmdet_tiny on detection is not supported yet.") - if "rtdetr" in recipe: - pytest.skip("rtdetr on detection is not supported yet.") - if "yolov9" in recipe: pytest.skip("yolov9 on detection is not supported yet.") @@ -123,11 +115,6 @@ def test_predict_with_explain( # TODO(Eugene): maskdino not support yet. pytest.skip(f"There's issue with inst-seg: {model_name}. Skip for now.") - if "dfine" in model_name: - # TODO(Eugene): dfine not support yet. - # https://jira.devtools.intel.com/browse/CVS-160781 - pytest.skip(f"There's issue with dfine: {model_name}. 
Skip for now.") - if "rtmdet_tiny" in recipe: # TODO (sungchul): enable xai for rtmdet_tiny (CVS-142651) pytest.skip("rtmdet_tiny on detection is not supported yet.") @@ -136,9 +123,6 @@ def test_predict_with_explain( # TODO (Galina): required to update model-api to 2.1 pytest.skip("yolox_tiny_tile on detection requires model-api update") - if "rtdetr" in recipe: - pytest.skip("rtdetr on detection is not supported yet.") - if "yolov9" in recipe: pytest.skip("yolov9 on detection is not supported yet.") diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index 0de2a490929..3c11993ddab 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -252,12 +252,6 @@ def test_otx_e2e( if "dino" in model_name: return # DINO is not supported. - if "dfine" in model_name: - return # DFine is not supported. - - if "rtdetr" in model_name: - return # RT-DETR currently is not supported. - if "yolov9" in model_name: return # RT-DETR currently is not supported. @@ -334,13 +328,8 @@ def test_otx_explain_e2e( if "dino" in model_name: pytest.skip("DINO is not supported.") - if "dfine" in model_name: - pytest.skip("DFine is not supported.") - if "maskrcnn_r50_tv" in model_name: pytest.skip("MaskRCNN R50 Torchvision model doesn't support explain.") - elif "rtdetr" in recipe: - pytest.skip("rtdetr model is not supported yet with explain.") elif "keypoint" in recipe: pytest.skip("keypoint detection models don't support explain.") elif "yolov9" in recipe: From 24b0b9f2a9837e3d80dd58edfcf833f6f4cd7f52 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 Jan 2025 15:22:10 +0100 Subject: [PATCH 8/9] Bump mlflow from 2.19.0 to 2.20.0 in the pip-others group (#4186) Bumps the pip-others group with 1 update: [mlflow](https://github.com/mlflow/mlflow). Updates `mlflow` from 2.19.0 to 2.20.0 - [Release notes](https://github.com/mlflow/mlflow/releases) - [Changelog](https://github.com/mlflow/mlflow/blob/master/CHANGELOG.md) - [Commits](https://github.com/mlflow/mlflow/compare/v2.19.0...v2.20.0) --- updated-dependencies: - dependency-name: mlflow dependency-type: direct:production update-type: version-update:semver-minor dependency-group: pip-others ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 567dfbce811..8c2438730f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,7 +120,7 @@ ci_benchmark = [ "ipython==8.31.0", "ipykernel==6.29.5", "openpyxl==3.1.5", - "mlflow==2.19.0", + "mlflow==2.20.0", "py-cpuinfo==9.0.0", ] From 4c99b6f37c5bc6569038e6b5681c511ebb95095b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 Jan 2025 15:53:38 +0100 Subject: [PATCH 9/9] Bump nbconvert from 7.16.5 to 7.16.6 in the pip-others group (#4190) Bumps the pip-others group with 1 update: [nbconvert](https://github.com/jupyter/nbconvert). 
Updates `nbconvert` from 7.16.5 to 7.16.6 - [Release notes](https://github.com/jupyter/nbconvert/releases) - [Changelog](https://github.com/jupyter/nbconvert/blob/main/CHANGELOG.md) - [Commits](https://github.com/jupyter/nbconvert/compare/v7.16.5...v7.16.6) --- updated-dependencies: - dependency-name: nbconvert dependency-type: direct:production update-type: version-update:semver-patch dependency-group: pip-others ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8c2438730f1..3fa45493763 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,7 +116,7 @@ ci_publish = [ ci_benchmark = [ "pandas<2.3", # To avoid conflict with nncf==2.9.0 "matplotlib==3.10.0", - "nbconvert==7.16.5", + "nbconvert==7.16.6", "ipython==8.31.0", "ipykernel==6.29.5", "openpyxl==3.1.5",