
Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories.

base repository: iree-org/iree-experimental
base: 1b65cec90adc3c7a70c6cdfd81c75ba3f2e62235
head repository: iree-org/iree-experimental
compare: 10ff9ed1e2c9af21bc1c04c05b447ed464068f11
Showing with 892 additions and 536 deletions.
  1. +0 −85 .github/workflows/benchmark_xla.yml
  2. +102 −0 .github/workflows/run_comparative_benchmarks.yml
  3. +1 −0 .gitignore
  4. +5 −5 iree-jax/benchmark/benchmark_all.sh
  5. +32 −41 iree-jax/benchmark/benchmark_model.py
  6. +8 −4 iree-tf/benchmark/benchmark_all.sh
  7. +57 −62 iree-tf/benchmark/benchmark_model.py
  8. +0 −3 iree-tf/benchmark/requirements.txt
  9. +9 −0 iree-tf/benchmark/setup_venv.sh
  10. +70 −22 iree-torch/library/import_models.py
  11. +6 −2 iree-torch/library/import_torch_models.sh
  12. +2 −2 iree-torch/library/models/bert_large.py
  13. +3 −1 iree-torch/library/models/efficientnet_b7.py
  14. +2 −2 iree-torch/library/models/efficientnet_v2_s.py
  15. +2 −2 iree-torch/library/models/resnet50.py
  16. +3 −1 iree-torch/library/models/sd_clip_text_model.py
  17. +3 −1 iree-torch/library/models/sd_unet_model.py
  18. +3 −1 iree-torch/library/models/sd_vae_model.py
  19. +3 −1 iree-torch/library/models/t5_large.py
  20. +12 −1 iree-torch/library/setup_venv.sh
  21. +22 −4 oobi/benchmark-definitions/python/data_types.py
  22. +12 −12 oobi/benchmark-definitions/python/input_data_definitions.py
  23. +77 −105 oobi/benchmark-definitions/python/jax_model_definitions.py
  24. +148 −127 oobi/benchmark-definitions/python/tf_model_definitions.py
  25. +6 −6 oobi/benchmark-definitions/python/tf_output_data_definitions.py
  26. +64 −46 oobi/benchmark-definitions/python/unique_ids.py
  27. 0 oobi/benchmark-definitions/python/utils/__init__.py
  28. +17 −0 oobi/benchmark-definitions/python/utils/execution_environment.py
  29. +34 −0 oobi/build_tools/docker/dockerfiles/base.Dockerfile
  30. +58 −0 oobi/build_tools/docker/dockerfiles/cuda11.8-cudnn8.9.Dockerfile
  31. +78 −0 oobi/build_tools/docker/dockerfiles/tensorflow2.13.0-rc0-cuda11.8-cudnn8.9.Dockerfile
  32. +53 −0 oobi/build_tools/docker/dockerfiles/tensorflow2.13.0-rc0.Dockerfile
85 changes: 0 additions & 85 deletions .github/workflows/benchmark_xla.yml

This file was deleted.

102 changes: 102 additions & 0 deletions .github/workflows/run_comparative_benchmarks.yml
@@ -0,0 +1,102 @@
# Copyright 2023 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Comparative Benchmarks Workflow.

name: Comparative Benchmarks

on:
  schedule:
    # Scheduled to run at 09:00 UTC and 21:00 UTC.
    - cron: '0 09,21 * * *'
  workflow_dispatch:
  pull_request:

jobs:
  benchmark_gpu:
    runs-on:
      - self-hosted # must come first
      - runner-group=presubmit
      - environment=testing
      - gpu
      - os-family=Linux
    env:
      TF_VERSION: 2.12.0
      LOCAL_OUTPUT_DIR: results-dir
      TF_RESULTS_JSON: tf-xla.json
      JAX_RESULTS_JSON: jax-xla.json
      BENCHMARK_DEVICE: gpu
      GCS_UPLOAD_ROOT_DIR: "gs://comparative-benchmark-artifacts"
    steps:
      - name: "Checking out PR repository"
        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
      - name: "Setup"
        run: |
          echo "GCS_UPLOAD_DIR=${GCS_UPLOAD_ROOT_DIR}/${BENCHMARK_DEVICE}_$(date +'%Y-%m-%d').$(date +'%s')" >> $GITHUB_ENV
          mkdir "${LOCAL_OUTPUT_DIR}"
      - name: "Benchmarking TF/XLA:GPU"
        run: |
          RESULTS_PATH="${LOCAL_OUTPUT_DIR}/${TF_RESULTS_JSON}"
          docker build --file "oobi/build_tools/docker/dockerfiles/tensorflow${TF_VERSION}-cuda11.8-cudnn8.9.Dockerfile" \
            --tag "tensorflow${TF_VERSION}-cuda11.8-cudnn8.9" "oobi/build_tools/docker/context"
          docker run --gpus all --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \
            "tensorflow${TF_VERSION}-cuda11.8-cudnn8.9:latest" \
            ./iree-tf/benchmark/benchmark_all.sh "${BENCHMARK_DEVICE}" "${TF_VERSION}" "${RESULTS_PATH}"
          cat "${RESULTS_PATH}"
          gcloud storage cp "${RESULTS_PATH}" "${GCS_UPLOAD_DIR}/"
      - name: "Benchmarking JAX/XLA:GPU"
        run: |
          RESULTS_PATH="${LOCAL_OUTPUT_DIR}/${JAX_RESULTS_JSON}"
          docker build --file "oobi/build_tools/docker/dockerfiles/cuda11.8-cudnn8.9.Dockerfile" \
            --tag "cuda11.8-cudnn8.9" "oobi/build_tools/docker/context"
          docker run --gpus all --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \
            "cuda11.8-cudnn8.9:latest" \
            ./iree-jax/benchmark/benchmark_all.sh "${BENCHMARK_DEVICE}" "${RESULTS_PATH}"
          cat "${RESULTS_PATH}"
          gcloud storage cp "${RESULTS_PATH}" "${GCS_UPLOAD_DIR}/"
  benchmark_cpu:
    runs-on:
      - self-hosted # must come first
      - runner-group=presubmit
      - environment=testing
      - machine-type=c2-standard-16
      - os-family=Linux
    env:
      TF_VERSION: 2.12.0
      LOCAL_OUTPUT_DIR: results-dir
      TF_RESULTS_JSON: tf-xla.json
      JAX_RESULTS_JSON: jax-xla.json
      BENCHMARK_DEVICE: cpu
      GCS_UPLOAD_ROOT_DIR: "gs://comparative-benchmark-artifacts"
    steps:
      - name: "Checking out PR repository"
        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
      - name: "Setup"
        run: |
          echo "GCS_UPLOAD_DIR=${GCS_UPLOAD_ROOT_DIR}/${BENCHMARK_DEVICE}_$(date +'%Y-%m-%d').$(date +'%s')" >> $GITHUB_ENV
          mkdir "${LOCAL_OUTPUT_DIR}"
      - name: "Benchmarking TF/XLA:CPU"
        run: |
          RESULTS_PATH="${LOCAL_OUTPUT_DIR}/${TF_RESULTS_JSON}"
          docker build --file "oobi/build_tools/docker/dockerfiles/tensorflow${TF_VERSION}.Dockerfile" \
            --tag "tensorflow${TF_VERSION}" "oobi/build_tools/docker/context"
          docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \
            "tensorflow${TF_VERSION}:latest" \
            ./iree-tf/benchmark/benchmark_all.sh "${BENCHMARK_DEVICE}" "${TF_VERSION}" "${RESULTS_PATH}"
          cat "${RESULTS_PATH}"
          gcloud storage cp "${RESULTS_PATH}" "${GCS_UPLOAD_DIR}/"
      - name: "Benchmarking JAX/XLA:CPU"
        run: |
          RESULTS_PATH="${LOCAL_OUTPUT_DIR}/${JAX_RESULTS_JSON}"
          docker build --file "oobi/build_tools/docker/dockerfiles/base.Dockerfile" \
            --tag "base" "oobi/build_tools/docker/context"
          docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \
            "base:latest" \
            ./iree-jax/benchmark/benchmark_all.sh "${BENCHMARK_DEVICE}" "${RESULTS_PATH}"
          cat "${RESULTS_PATH}"
          gcloud storage cp "${RESULTS_PATH}" "${GCS_UPLOAD_DIR}/"
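As an aside, the GCS destination set in each Setup step combines the device name, the calendar date, and a Unix timestamp, so every run uploads into its own directory under gs://comparative-benchmark-artifacts. A minimal Python sketch of the same naming scheme, purely illustrative (the workflow does this with shell and `date`):

import datetime
import time

GCS_UPLOAD_ROOT_DIR = "gs://comparative-benchmark-artifacts"

def build_upload_dir(benchmark_device: str) -> str:
  # Mirrors ${BENCHMARK_DEVICE}_$(date +'%Y-%m-%d').$(date +'%s') from the workflow.
  date_str = datetime.date.today().isoformat()
  epoch_s = int(time.time())
  return f"{GCS_UPLOAD_ROOT_DIR}/{benchmark_device}_{date_str}.{epoch_s}"

# Example output: gs://comparative-benchmark-artifacts/gpu_2023-05-21.1684656000
print(build_upload_dir("gpu"))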
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
.vscode/
__pycache__
*.venv/
.python-version

# CMake artifacts
build/
10 changes: 5 additions & 5 deletions iree-jax/benchmark/benchmark_all.sh
@@ -9,9 +9,9 @@ VENV_DIR="jax-benchmarks.venv"
VENV_DIR="${VENV_DIR}" ${TD}/setup_venv.sh
source ${VENV_DIR}/bin/activate

MODEL_RESNET50_FP32_JAX="c0a738bc-0c21-40b6-b565-31fe7fd33d0d"
MODEL_BERT_LARGE_FP32_JAX="f76dc3a5-3379-49ab-85e5-744ff5167310"
MODEL_T5_LARGE_FP32_JAX="7720beef-ac1a-4a5f-8777-505ea949a138"
MODEL_RESNET50_FP32_JAX="aff75509-4420-40a8-844e-dbfc48494fe6-MODEL_RESNET50-fp32-JAX-3x224x224xf32"
MODEL_BERT_LARGE_FP32_JAX="47cb0d3a-5eb7-41c7-9d7c-97aae7023ecf-MODEL_BERT_LARGE-fp32-JAX-384xi32"
MODEL_T5_LARGE_FP32_JAX="173c7180-bad4-4b91-8423-4beeb13d2b0a-MODEL_T5_LARGE-fp32-JAX-512xi32"

declare -a gpu_benchmark_ids=(
"${MODEL_RESNET50_FP32_JAX}-batch1"
@@ -52,10 +52,10 @@ declare -a cpu_benchmark_ids=(

if [ "${DEVICE}" = "gpu" ]; then
BENCHMARK_IDS=("${gpu_benchmark_ids[@]}")
ITERATIONS=5
ITERATIONS=50
else
BENCHMARK_IDS=("${cpu_benchmark_ids[@]}")
ITERATIONS=5
ITERATIONS=20
fi

# Create json file and populate with global information.
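Note that the model IDs above change from bare UUIDs to composite identifiers that also encode the model name, data type, framework, and input shape, and the per-benchmark entries then append a -batchN suffix. A minimal sketch of that format, assuming only what the script shows (the helper name is hypothetical):

def build_benchmark_id(model_uuid: str, model_name: str, data_type: str,
                       framework: str, input_dims: str, batch_size: int) -> str:
  # Composite ID format used in benchmark_all.sh, plus the -batchN suffix.
  return f"{model_uuid}-{model_name}-{data_type}-{framework}-{input_dims}-batch{batch_size}"

# e.g. "aff75509-4420-40a8-844e-dbfc48494fe6-MODEL_RESNET50-fp32-JAX-3x224x224xf32-batch1"
print(build_benchmark_id("aff75509-4420-40a8-844e-dbfc48494fe6", "MODEL_RESNET50",
                         "fp32", "JAX", "3x224x224xf32", 1))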
73 changes: 32 additions & 41 deletions iree-jax/benchmark/benchmark_model.py
@@ -15,17 +15,15 @@
from models import bert_large, resnet50, t5_large

# Add benchmark definitions to the search path.
sys.path.insert(
0,
str(
pathlib.Path(__file__).parent.parent.parent / "oobi" /
"benchmark-definitions" / "python"))
sys.path.insert(0, str(pathlib.Path(__file__).parent.parent.parent / "oobi" / "benchmark-definitions" / "python"))
import data_types, jax_model_definitions, unique_ids

from utils import execution_environment

def benchmark_lookup(unique_id: str):
if unique_id not in jax_model_definitions.JAX_MODELS_DICT:
raise ValueError(f"Id {unique_id} does not exist in model suite.")
id_list = '\n '.join(jax_model_definitions.JAX_MODELS_DICT.keys())
raise ValueError(f"Id {unique_id} does not exist in model suite. Expected "
f"one of:\n {id_list}")

model_definition = jax_model_definitions.JAX_MODELS_DICT[unique_id]
if unique_id.startswith(unique_ids.MODEL_RESNET50_FP32_JAX):
@@ -42,7 +40,11 @@ def dump_result(file_path: str, result: dict) -> None:
with open(file_path, "r") as f:
dictObj = json.load(f)

dictObj["execution_environment"] = {
"python_environment": execution_environment.get_python_environment_info()
}
dictObj["benchmarks"].append(result)

with open(file_path, "w") as f:
json.dump(dictObj, f)

@@ -75,7 +77,7 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,
end = time.perf_counter()
latency = 1000 * (end - start)
if i == 0:
compilation_time_s = latency / 1000
compile_time_s = latency / 1000
warmup_latencies.append(latency)

# Run benchmark.
@@ -88,35 +90,20 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,

# Save results.
result_dict = {
"min_warmup_latency_ms":
"n/a" if not warmup_latencies else str(min(warmup_latencies)),
"max_warmup_latency_ms":
"n/a" if not warmup_latencies else str(max(warmup_latencies)),
"mean_warmup_latency_ms":
"n/a" if not warmup_latencies else str(
statistics.mean(warmup_latencies)),
"median_warmup_latency_ms":
"n/a" if not warmup_latencies else str(
statistics.median(warmup_latencies)),
"stddev_warmup_latency_ms":
"n/a" if not warmup_latencies else str(
statistics.stdev(warmup_latencies)),
"warmup_iterations":
str(warmup_iterations),
"min_latency_ms":
str(min(latencies)),
"max_latency_ms":
str(max(latencies)),
"mean_latency_ms":
str(statistics.mean(latencies)),
"median_latency_ms":
str(statistics.median(latencies)),
"stddev_latency_ms":
str(statistics.stdev(latencies)),
"benchmark_iterations":
str(benchmark_iterations),
"compile_time_s": "n/a" if not warmup_latencies else str(compilation_time_s),
"input_data_transfer_ms": str(input_data_transfer_ms),
"min_warmup_latency_ms": min(warmup_latencies, default=None),
"max_warmup_latency_ms": max(warmup_latencies, default=None),
"mean_warmup_latency_ms": None if not warmup_latencies else statistics.mean(warmup_latencies),
"median_warmup_latency_ms": None if not warmup_latencies else statistics.median(warmup_latencies),
"stddev_warmup_latency_ms": None if not warmup_latencies else statistics.stdev(warmup_latencies),
"warmup_iterations": warmup_iterations,
"min_latency_ms": min(latencies, default=None),
"max_latency_ms": max(latencies, default=None),
"mean_latency_ms": None if not latencies else statistics.mean(latencies),
"median_latency_ms": None if not latencies else statistics.median(latencies),
"stddev_latency_ms": None if not latencies else statistics.stdev(latencies),
"benchmark_iterations": benchmark_iterations,
"compile_time_s": compile_time_s,
"input_data_transfer_ms": input_data_transfer_ms,
}
shared_dict.update(result_dict)

@@ -155,7 +142,7 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,
help="The path to `run_hlo_module`.")
argParser.add_argument(
"--run_in_process",
action=argparse.BooleanOptionalAction,
action="store_true",
help=
"Whether to run the benchmark under the same process. Set this to true when profiling a single workload"
)
@@ -172,10 +159,14 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,
benchmark_definition = {
"benchmark_id": args.benchmark_id,
"benchmark_name": model_definition.name,
"batch_size": str(batch_size),
"framework": "jax",
"framework": str(model_definition.meta_model.framework_type),
"data_type": str(model_definition.meta_model.data_type),
"batch_size": batch_size,
"inputs": model_definition.inputs.tensor_dimensions,
"outputs": model_definition.outputs.tensor_dimensions,
"compiler": "xla",
"device": args.device,
"tags": model_definition.meta_model.tags + model_definition.tags,
}

framework_metrics = {}
@@ -203,5 +194,5 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,
"framework_level": framework_metrics,
}
}
print(result)
print(json.dumps(result, indent=2))
dump_result(args.output_path, result)
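The reworked result_dict above stores plain numbers, falling back to None when a latency list is empty, and leaves all formatting to json.dumps. A small self-contained sketch of the same pattern (function and variable names here are illustrative, not taken from the script):

import json
import statistics
from typing import Optional, Sequence

def summarize_latencies(latencies: Sequence[float]) -> dict:
  # Values become None (rather than the old "n/a" strings) when there are no samples.
  def safe(fn) -> Optional[float]:
    return fn(latencies) if latencies else None

  return {
      "min_latency_ms": min(latencies, default=None),
      "max_latency_ms": max(latencies, default=None),
      "mean_latency_ms": safe(statistics.mean),
      "median_latency_ms": safe(statistics.median),
      "stddev_latency_ms": safe(statistics.stdev),
  }

print(json.dumps(summarize_latencies([12.3, 11.8, 12.1]), indent=2))
print(json.dumps(summarize_latencies([]), indent=2))  # all fields null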
12 changes: 8 additions & 4 deletions iree-tf/benchmark/benchmark_all.sh
@@ -10,9 +10,9 @@ VENV_DIR="tf-benchmarks.venv"
VENV_DIR="${VENV_DIR}" TENSORFLOW_VERSION="${TENSORFLOW_VERSION}" ${TD}/setup_venv.sh
source ${VENV_DIR}/bin/activate

MODEL_RESNET50_FP32_TF="2e1bd635-eeb3-41fa-90a6-e1cfdfa9be0a"
MODEL_BERT_LARGE_FP32_TF="979ff492-f363-4320-875f-e1ef93521132"
MODEL_T5_LARGE_FP32_TF="723da674-f42e-4d14-991e-16ad86a0d81b"
MODEL_RESNET50_FP32_TF="aff75509-4420-40a8-844e-dbfc48494fe6-MODEL_RESNET50-fp32-TF-224x224x3xf32"
MODEL_BERT_LARGE_FP32_TF="47cb0d3a-5eb7-41c7-9d7c-97aae7023ecf-MODEL_BERT_LARGE-fp32-TF-384xi32"
MODEL_T5_LARGE_FP32_TF="173c7180-bad4-4b91-8423-4beeb13d2b0a-MODEL_T5_LARGE-fp32-TF-512xi32"

declare -a gpu_benchmark_ids=(
"${MODEL_RESNET50_FP32_TF}-batch1"
@@ -59,6 +59,10 @@ else
ITERATIONS=20
fi

# Compiler-level benchmarks compile and run an inference per iteration.
# We keep this number low to reduce total benchmark time.
HLO_ITERATIONS=3

# Create json file and populate with global information.
rm "${OUTPUT_PATH}"
echo "{\"trigger\": { \"timestamp\": \"$(date +'%s')\" }, \"benchmarks\": []}" > "${OUTPUT_PATH}"
@@ -69,7 +73,7 @@ for benchmark_id in "${BENCHMARK_IDS[@]}"; do
--device="${DEVICE}"
--output_path="${OUTPUT_PATH}"
--iterations="${ITERATIONS}"
--hlo_iterations="${ITERATIONS}"
--hlo_iterations="${HLO_ITERATIONS}"
)

if [ -z "${TF_RUN_HLO_MODULE_PATH}" ]; then