
Commit 21373fa

Author: Songki Choi
Add anomaly perf benchmark tests (#3170)
* Add anomaly perf benchmark tests
* Refine workflow
* Add options for model-category
* Remove num_classes / data_format setting
Parent: 10f66e8

10 files changed: 306 additions & 65 deletions

.github/workflows/perf_benchmark.yaml

Lines changed: 47 additions & 1 deletion

@@ -7,6 +7,9 @@ on:
         type: choice
         description: Model category to run benchmark
         options:
+          - speed
+          - balance
+          - accuracy
           - default # speed, balance, accuracy models only
           - all # default + other models
         default: default
@@ -50,6 +53,45 @@ on:
           `pip install otx[full]@https://github.com/openvinotoolkit/training_extensions.git@{otx_ref}` will be executed before run,
           and reverted after run. Works only for v2.x assuming CLI compatibility.
         default: __CURRENT_BRANCH_COMMIT__
+  workflow_call:
+    inputs:
+      model-category:
+        type: string
+        description: Model category to run benchmark [speed, balance, accuracy, default, all]
+        default: default
+      data-group:
+        type: string
+        description: Data group to run benchmark [small, medium, large, all]
+        default: all
+      num-repeat:
+        type: number
+        description: Overrides default per-data-group number of repeat setting
+        default: 0
+      num-epoch:
+        type: number
+        description: Overrides default per-model number of epoch setting
+        default: 0
+      eval-upto:
+        type: string
+        description: The last operation to evaluate. 'optimize' means all. [train, export, optimize]
+        default: optimize
+      pytest-args:
+        type: string
+        description: |
+          Additional perf-benchmark pytest arguments.
+          "-k detection" -> detection task only
+          "--dry-run" -> print command w/o execution.
+      data-root:
+        type: string
+        description: Root directory containing validation data in CI server.
+        default: "/home/validation/data/v2/"
+      otx-ref:
+        type: string
+        description: |
+          Target OTX ref (tag / branch name / commit hash) on main repo to test. Defaults to the current branch.
+          `pip install otx[full]@https://github.com/openvinotoolkit/training_extensions.git@{otx_ref}` will be executed before run,
+          and reverted after run. Works only for v2.x assuming CLI compatibility.
+        default: __CURRENT_BRANCH_COMMIT__

 # Declare default permissions as read only.
 permissions: read-all
@@ -73,7 +115,7 @@ jobs:
           - task-short: "vsp"
             task: "visual_prompting"
     name: Perf-Benchmark-${{ matrix.task-short }}
-    runs-on: [self-hosted, linux, x64, dmount-v2, perf]
+    runs-on: [self-hosted, linux, x64, dmount-v2]
     timeout-minutes: 8640
     steps:
      - name: Checkout repository
@@ -85,6 +127,10 @@ jobs:
      - name: Install tox
        run: python -m pip install --require-hashes --no-deps -r .ci/tox-deps.txt
      - name: Run Performance Test
+       env:
+         BENCHMARK_RESULTS_CLEAR: ${{ vars.BENCHMARK_RESULTS_CLEAR }}
+         GH_CTX_REF_NAME: ${{ github.ref_name }}
+         GH_CTX_SHA: ${{ github.sha }}
        run: >
          tox -vv -e perf-benchmark -- tests/perf/test_${{ matrix.task }}.py ${{ inputs.pytest-args }}
          --model-category ${{ inputs.model-category }}
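
The new env: block forwards CI context into the test process. The code that consumes these variables is not shown in this diff, so the following is only a minimal Python sketch, assuming the benchmark reads them via os.environ; all function names here are invented for illustration.

    # Hypothetical consumer of the env vars set by the workflow above.
    import os

    def build_ci_tags() -> dict[str, str]:
        """Collect the CI context forwarded by the workflow's env block."""
        return {
            "branch": os.environ.get("GH_CTX_REF_NAME", "unknown"),
            "commit": os.environ.get("GH_CTX_SHA", "unknown"),
        }

    def should_clear_results() -> bool:
        # BENCHMARK_RESULTS_CLEAR comes from a repository variable (vars.*);
        # treat common truthy strings as "start from a clean result store".
        return os.environ.get("BENCHMARK_RESULTS_CLEAR", "false").lower() in ("1", "true", "yes")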

.github/workflows/weekly.yaml

Lines changed: 0 additions & 1 deletion

@@ -19,4 +19,3 @@ jobs:
       num-repeat: 0
       num-epoch: 0
       eval-upto: optimize
-      artifact-prefix: weekly-perf-benchmark

tests/perf/benchmark.py

Lines changed: 38 additions & 12 deletions

@@ -56,8 +56,6 @@ class Dataset:
     name: str
     path: Path
     group: str
-    data_format: str
-    num_classes: int
     num_repeat: int = 1
     extra_overrides: dict | None = None

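With data_format and num_classes dropped, a dataset entry now carries only its name, path, and size group, plus the optional repeat count and overrides. A hypothetical entry under the slimmed schema; the name and path are placeholders, not taken from this commit.

    from pathlib import Path

    # Placeholder values for illustration only; Dataset is the dataclass
    # defined in tests/perf/benchmark.py above.
    dataset = Dataset(
        name="anomaly_sample_small_1",
        path=Path("anomaly/sample_small/1"),
        group="small",
        num_repeat=3,
    )
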
@@ -155,10 +153,6 @@ def run(
             str(data_root),
             "--work_dir",
             str(sub_work_dir),
-            "--model.num_classes",
-            str(dataset.num_classes),
-            "--data.config.data_format",
-            dataset.data_format,
             "--engine.device",
             self.accelerator,
         ]
@@ -172,7 +166,10 @@ def run(
         start_time = time()
         self._run_command(command)
         extra_metrics = {"train/e2e_time": time() - start_time}
-        self._rename_raw_data(work_dir=sub_work_dir / ".latest" / "train", replaces={"epoch": "train/epoch"})
+        self._rename_raw_data(
+            work_dir=sub_work_dir / ".latest" / "train",
+            replaces={"train_": "train/", "{pre}": "train/"},
+        )
         self._log_metrics(
             work_dir=sub_work_dir / ".latest" / "train",
             tags=tags,
@@ -187,6 +184,10 @@ def run(
             str(sub_work_dir),
         ]
         self._run_command(command)
+        self._rename_raw_data(
+            work_dir=sub_work_dir / ".latest" / "test",
+            replaces={"test_": "test/", "{pre}": "test/"},
+        )
         self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)

         # Export & test
@@ -215,7 +216,10 @@ def run(
         ]
         self._run_command(command)

-        self._rename_raw_data(work_dir=sub_work_dir / ".latest" / "test", replaces={"test": "export"})
+        self._rename_raw_data(
+            work_dir=sub_work_dir / ".latest" / "test",
+            replaces={"test": "export", "{pre}": "export/"},
+        )
         self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)

         # Optimize & test
@@ -250,7 +254,10 @@ def run(
         ]
         self._run_command(command)

-        self._rename_raw_data(work_dir=sub_work_dir / ".latest" / "test", replaces={"test": "optimize"})
+        self._rename_raw_data(
+            work_dir=sub_work_dir / ".latest" / "test",
+            replaces={"test": "optimize", "{pre}": "optimize/"},
+        )
         self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)

         # Force memory clean up
@@ -310,11 +317,25 @@ def _log_metrics(
         metrics.to_csv(work_dir / "benchmark.raw.csv", index=False)

     def _rename_raw_data(self, work_dir: Path, replaces: dict[str, str]) -> None:
+        replaces = {**self.NAME_MAPPING, **replaces}
+
+        def _rename_col(col_name: str) -> str:
+            for src_str, dst_str in replaces.items():
+                if src_str == "{pre}":
+                    if not col_name.startswith(dst_str):
+                        col_name = dst_str + col_name
+                elif src_str == "{post}":
+                    if not col_name.endswith(dst_str):
+                        col_name = col_name + dst_str
+                else:
+                    col_name = col_name.replace(src_str, dst_str)
+            return col_name
+
         csv_files = work_dir.glob("**/metrics.csv")
         for csv_file in csv_files:
             data = pd.read_csv(csv_file)
-            for src_str, dst_str in replaces.items():
-                data.columns = data.columns.str.replace(src_str, dst_str)
+            data = data.rename(columns=_rename_col)  # Column names
+            data = data.replace(replaces)  # Values
             data.to_csv(csv_file, index=False)

     @staticmethod
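
In the new renaming scheme, "{pre}" and "{post}" are sentinel keys rather than literal substrings: they prepend or append the mapped string only when it is not already present, while every other pair is a plain substring replacement. A standalone restatement of the rule, runnable outside the class:

    # Copy of the renaming rule introduced above, with the train/test-style
    # mapping used by the run() method.
    replaces = {"test_": "test/", "{pre}": "test/"}

    def rename_col(col_name: str) -> str:
        for src_str, dst_str in replaces.items():
            if src_str == "{pre}":  # ensure prefix
                if not col_name.startswith(dst_str):
                    col_name = dst_str + col_name
            elif src_str == "{post}":  # ensure suffix
                if not col_name.endswith(dst_str):
                    col_name = col_name + dst_str
            else:  # plain substring replacement
                col_name = col_name.replace(src_str, dst_str)
        return col_name

    print(rename_col("test_f1_score"))  # -> "test/f1_score" (replaced; prefix already present)
    print(rename_col("epoch"))          # -> "test/epoch" (prefix added by "{pre}")
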
@@ -338,7 +359,7 @@ def load_result(result_path: Path) -> pd.DataFrame | None:
         return pd.concat(results, ignore_index=True).set_index(["task", "model", "data_group", "data"])

     @staticmethod
-    def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
+    def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame | None:
         """Average result w.r.t. given keys

         Args:
@@ -348,6 +369,9 @@ def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
         Returns:
             pd.DataFrame: Averaged result table
         """
+        if data is None:
+            return None
+
         # Flatten index
         index_names = data.index.names
         column_names = data.columns
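
Since load_result is typed to return None when no result files are found, widening average_result to pd.DataFrame | None lets the two compose without crashing on data.index. A minimal usage sketch, assuming the Benchmark class above is importable and the result directory is a placeholder:

    from pathlib import Path

    # Assumes Benchmark from tests/perf/benchmark.py is on the path.
    raw = Benchmark.load_result(Path("some/empty/result/dir"))  # may be None
    avg = Benchmark.average_result(raw, keys=["task", "model", "data_group"])
    if avg is None:
        print("no benchmark results found; nothing to average")
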
@@ -391,3 +415,5 @@ def check(self, result: pd.DataFrame, criteria: list[Criterion]):

         for criterion in criteria:
             criterion(result_entry, target_entry)
+
+    NAME_MAPPING: dict[str, str] = {}  # noqa: RUF012
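
NAME_MAPPING is merged under the per-call replaces, so a task-specific subclass can normalize its raw metric names once. The new anomaly benchmark module is among the changed files not shown in this excerpt; a hypothetical override, with invented metric names, might look like:

    # Hypothetical subclass; the class and metric names are invented
    # for illustration and are not taken from this commit.
    class AnomalyBenchmark(Benchmark):
        NAME_MAPPING: dict[str, str] = {  # noqa: RUF012
            "image_F1Score": "f1-score",
            "pixel_F1Score": "pixel_f1-score",
        }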

tests/perf/conftest.py

Lines changed: 5 additions & 3 deletions

@@ -27,8 +27,8 @@ def pytest_addoption(parser):
         "--model-category",
         action="store",
         default="all",
-        choices=("default", "all"),
-        help="Choose default|all. Defaults to all.",
+        choices=("speed", "balance", "accuracy", "default", "other", "all"),
+        help="Choose speed|balance|accuracy|default|other|all. Defaults to all.",
     )
     parser.addoption(
         "--data-group",
@@ -290,7 +290,9 @@ def fxt_mlflow_client(request: pytest.FixtureRequest) -> MlflowClient:
 def fxt_model(request: pytest.FixtureRequest, fxt_model_category) -> Benchmark.Model:
     """Skip models according to user options."""
     model: Benchmark.Model = request.param
-    if fxt_model_category == "default" and model.category == "other":
+    if fxt_model_category == "all":
+        return model
+    if (fxt_model_category == "default" and model.category == "other") or fxt_model_category != model.category:
         pytest.skip(f"{model.category} category model")
     return model
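
The reworked fixture short-circuits on "all"; every other option is matched against the model's category before the test is collected. A standalone restatement of the skip condition, runnable on its own:

    # Mirror of the fixture's skip condition shown above.
    def is_skipped(option: str, category: str) -> bool:
        if option == "all":
            return False
        return (option == "default" and category == "other") or option != category

    assert is_skipped("speed", "balance")    # non-matching category is skipped
    assert not is_skipped("speed", "speed")  # exact match is kept
    assert not is_skipped("all", "other")    # "all" keeps everything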
