
Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories.

base repository: iree-org/iree-experimental
base: 1b65cec90adc3c7a70c6cdfd81c75ba3f2e62235
head repository: iree-org/iree-experimental
compare: 10ff9ed1e2c9af21bc1c04c05b447ed464068f11
Showing with 892 additions and 536 deletions.
  1. +0 −85 .github/workflows/benchmark_xla.yml
  2. +102 −0 .github/workflows/run_comparative_benchmarks.yml
  3. +1 −0 .gitignore
  4. +5 −5 iree-jax/benchmark/benchmark_all.sh
  5. +32 −41 iree-jax/benchmark/benchmark_model.py
  6. +8 −4 iree-tf/benchmark/benchmark_all.sh
  7. +57 −62 iree-tf/benchmark/benchmark_model.py
  8. +0 −3 iree-tf/benchmark/requirements.txt
  9. +9 −0 iree-tf/benchmark/setup_venv.sh
  10. +70 −22 iree-torch/library/import_models.py
  11. +6 −2 iree-torch/library/import_torch_models.sh
  12. +2 −2 iree-torch/library/models/bert_large.py
  13. +3 −1 iree-torch/library/models/efficientnet_b7.py
  14. +2 −2 iree-torch/library/models/efficientnet_v2_s.py
  15. +2 −2 iree-torch/library/models/resnet50.py
  16. +3 −1 iree-torch/library/models/sd_clip_text_model.py
  17. +3 −1 iree-torch/library/models/sd_unet_model.py
  18. +3 −1 iree-torch/library/models/sd_vae_model.py
  19. +3 −1 iree-torch/library/models/t5_large.py
  20. +12 −1 iree-torch/library/setup_venv.sh
  21. +22 −4 oobi/benchmark-definitions/python/data_types.py
  22. +12 −12 oobi/benchmark-definitions/python/input_data_definitions.py
  23. +77 −105 oobi/benchmark-definitions/python/jax_model_definitions.py
  24. +148 −127 oobi/benchmark-definitions/python/tf_model_definitions.py
  25. +6 −6 oobi/benchmark-definitions/python/tf_output_data_definitions.py
  26. +64 −46 oobi/benchmark-definitions/python/unique_ids.py
  27. 0 oobi/benchmark-definitions/python/utils/__init__.py
  28. +17 −0 oobi/benchmark-definitions/python/utils/execution_environment.py
  29. +34 −0 oobi/build_tools/docker/dockerfiles/base.Dockerfile
  30. +58 −0 oobi/build_tools/docker/dockerfiles/cuda11.8-cudnn8.9.Dockerfile
  31. +78 −0 oobi/build_tools/docker/dockerfiles/tensorflow2.13.0-rc0-cuda11.8-cudnn8.9.Dockerfile
  32. +53 −0 oobi/build_tools/docker/dockerfiles/tensorflow2.13.0-rc0.Dockerfile
85 changes: 0 additions & 85 deletions .github/workflows/benchmark_xla.yml

This file was deleted.

102 changes: 102 additions & 0 deletions .github/workflows/run_comparative_benchmarks.yml
@@ -0,0 +1,102 @@
# Copyright 2023 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Comparative Benchmarks Workflow.

name: Comparative Benchmarks

on:
  schedule:
    # Scheduled to run at 09:00 UTC and 21:00 UTC.
    - cron: '0 09,21 * * *'
  workflow_dispatch:
  pull_request:

jobs:
  benchmark_gpu:
    runs-on:
      - self-hosted # must come first
      - runner-group=presubmit
      - environment=testing
      - gpu
      - os-family=Linux
    env:
      TF_VERSION: 2.12.0
      LOCAL_OUTPUT_DIR: results-dir
      TF_RESULTS_JSON: tf-xla.json
      JAX_RESULTS_JSON: jax-xla.json
      BENCHMARK_DEVICE: gpu
      GCS_UPLOAD_ROOT_DIR: "gs://comparative-benchmark-artifacts"
    steps:
      - name: "Checking out PR repository"
        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
      - name: "Setup"
        run: |
          echo "GCS_UPLOAD_DIR=${GCS_UPLOAD_ROOT_DIR}/${BENCHMARK_DEVICE}_$(date +'%Y-%m-%d').$(date +'%s')" >> $GITHUB_ENV
          mkdir "${LOCAL_OUTPUT_DIR}"
      - name: "Benchmarking TF/XLA:GPU"
        run: |
          RESULTS_PATH="${LOCAL_OUTPUT_DIR}/${TF_RESULTS_JSON}"
          docker build --file "oobi/build_tools/docker/dockerfiles/tensorflow${TF_VERSION}-cuda11.8-cudnn8.9.Dockerfile" \
            --tag "tensorflow${TF_VERSION}-cuda11.8-cudnn8.9" "oobi/build_tools/docker/context"
          docker run --gpus all --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \
            "tensorflow${TF_VERSION}-cuda11.8-cudnn8.9:latest" \
            ./iree-tf/benchmark/benchmark_all.sh "${BENCHMARK_DEVICE}" "${TF_VERSION}" "${RESULTS_PATH}"
          cat "${RESULTS_PATH}"
          gcloud storage cp "${RESULTS_PATH}" "${GCS_UPLOAD_DIR}/"
      - name: "Benchmarking JAX/XLA:GPU"
        run: |
          RESULTS_PATH="${LOCAL_OUTPUT_DIR}/${JAX_RESULTS_JSON}"
          docker build --file "oobi/build_tools/docker/dockerfiles/cuda11.8-cudnn8.9.Dockerfile" \
            --tag "cuda11.8-cudnn8.9" "oobi/build_tools/docker/context"
          docker run --gpus all --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \
            "cuda11.8-cudnn8.9:latest" \
            ./iree-jax/benchmark/benchmark_all.sh "${BENCHMARK_DEVICE}" "${RESULTS_PATH}"
          cat "${RESULTS_PATH}"
          gcloud storage cp "${RESULTS_PATH}" "${GCS_UPLOAD_DIR}/"
  benchmark_cpu:
    runs-on:
      - self-hosted # must come first
      - runner-group=presubmit
      - environment=testing
      - machine-type=c2-standard-16
      - os-family=Linux
    env:
      TF_VERSION: 2.12.0
      LOCAL_OUTPUT_DIR: results-dir
      TF_RESULTS_JSON: tf-xla.json
      JAX_RESULTS_JSON: jax-xla.json
      BENCHMARK_DEVICE: cpu
      GCS_UPLOAD_ROOT_DIR: "gs://comparative-benchmark-artifacts"
    steps:
      - name: "Checking out PR repository"
        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
      - name: "Setup"
        run: |
          echo "GCS_UPLOAD_DIR=${GCS_UPLOAD_ROOT_DIR}/${BENCHMARK_DEVICE}_$(date +'%Y-%m-%d').$(date +'%s')" >> $GITHUB_ENV
          mkdir "${LOCAL_OUTPUT_DIR}"
      - name: "Benchmarking TF/XLA:CPU"
        run: |
          RESULTS_PATH="${LOCAL_OUTPUT_DIR}/${TF_RESULTS_JSON}"
          docker build --file "oobi/build_tools/docker/dockerfiles/tensorflow${TF_VERSION}.Dockerfile" \
            --tag "tensorflow${TF_VERSION}" "oobi/build_tools/docker/context"
          docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \
            "tensorflow${TF_VERSION}:latest" \
            ./iree-tf/benchmark/benchmark_all.sh "${BENCHMARK_DEVICE}" "${TF_VERSION}" "${RESULTS_PATH}"
          cat "${RESULTS_PATH}"
          gcloud storage cp "${RESULTS_PATH}" "${GCS_UPLOAD_DIR}/"
      - name: "Benchmarking JAX/XLA:CPU"
        run: |
          RESULTS_PATH="${LOCAL_OUTPUT_DIR}/${JAX_RESULTS_JSON}"
          docker build --file "oobi/build_tools/docker/dockerfiles/base.Dockerfile" \
            --tag "base" "oobi/build_tools/docker/context"
          docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \
            "base:latest" \
            ./iree-jax/benchmark/benchmark_all.sh "${BENCHMARK_DEVICE}" "${RESULTS_PATH}"
          cat "${RESULTS_PATH}"
          gcloud storage cp "${RESULTS_PATH}" "${GCS_UPLOAD_DIR}/"
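As an aside, the GCS destination set in each Setup step combines the device name, the calendar date, and a Unix timestamp, so every run uploads into its own directory under gs://comparative-benchmark-artifacts. A minimal Python sketch of the same naming scheme, purely illustrative (the workflow does this with shell and `date`):

import datetime
import time

GCS_UPLOAD_ROOT_DIR = "gs://comparative-benchmark-artifacts"

def build_upload_dir(benchmark_device: str) -> str:
  # Mirrors ${BENCHMARK_DEVICE}_$(date +'%Y-%m-%d').$(date +'%s') from the workflow.
  date_str = datetime.date.today().isoformat()
  epoch_s = int(time.time())
  return f"{GCS_UPLOAD_ROOT_DIR}/{benchmark_device}_{date_str}.{epoch_s}"

# Example output: gs://comparative-benchmark-artifacts/gpu_2023-05-21.1684656000
print(build_upload_dir("gpu"))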
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
.vscode/
__pycache__
*.venv/
.python-version

# CMake artifacts
build/
10 changes: 5 additions & 5 deletions iree-jax/benchmark/benchmark_all.sh
@@ -9,9 +9,9 @@ VENV_DIR="jax-benchmarks.venv"
VENV_DIR="${VENV_DIR}" ${TD}/setup_venv.sh
source ${VENV_DIR}/bin/activate

MODEL_RESNET50_FP32_JAX="c0a738bc-0c21-40b6-b565-31fe7fd33d0d"
MODEL_BERT_LARGE_FP32_JAX="f76dc3a5-3379-49ab-85e5-744ff5167310"
MODEL_T5_LARGE_FP32_JAX="7720beef-ac1a-4a5f-8777-505ea949a138"
MODEL_RESNET50_FP32_JAX="aff75509-4420-40a8-844e-dbfc48494fe6-MODEL_RESNET50-fp32-JAX-3x224x224xf32"
MODEL_BERT_LARGE_FP32_JAX="47cb0d3a-5eb7-41c7-9d7c-97aae7023ecf-MODEL_BERT_LARGE-fp32-JAX-384xi32"
MODEL_T5_LARGE_FP32_JAX="173c7180-bad4-4b91-8423-4beeb13d2b0a-MODEL_T5_LARGE-fp32-JAX-512xi32"

declare -a gpu_benchmark_ids=(
"${MODEL_RESNET50_FP32_JAX}-batch1"
@@ -52,10 +52,10 @@ declare -a cpu_benchmark_ids=(

if [ "${DEVICE}" = "gpu" ]; then
BENCHMARK_IDS=("${gpu_benchmark_ids[@]}")
ITERATIONS=5
ITERATIONS=50
else
BENCHMARK_IDS=("${cpu_benchmark_ids[@]}")
ITERATIONS=5
ITERATIONS=20
fi

# Create json file and populate with global information.
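Note that the model IDs above change from bare UUIDs to composite identifiers that also encode the model name, data type, framework, and input shape, and the per-benchmark entries then append a -batchN suffix. A minimal sketch of that format, assuming only what the script shows (the helper name is hypothetical):

def build_benchmark_id(model_uuid: str, model_name: str, data_type: str,
                       framework: str, input_dims: str, batch_size: int) -> str:
  # Composite ID format used in benchmark_all.sh, plus the -batchN suffix.
  return f"{model_uuid}-{model_name}-{data_type}-{framework}-{input_dims}-batch{batch_size}"

# e.g. "aff75509-4420-40a8-844e-dbfc48494fe6-MODEL_RESNET50-fp32-JAX-3x224x224xf32-batch1"
print(build_benchmark_id("aff75509-4420-40a8-844e-dbfc48494fe6", "MODEL_RESNET50",
                         "fp32", "JAX", "3x224x224xf32", 1))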
73 changes: 32 additions & 41 deletions iree-jax/benchmark/benchmark_model.py
@@ -15,17 +15,15 @@
from models import bert_large, resnet50, t5_large

# Add benchmark definitions to the search path.
sys.path.insert(
0,
str(
pathlib.Path(__file__).parent.parent.parent / "oobi" /
"benchmark-definitions" / "python"))
sys.path.insert(0, str(pathlib.Path(__file__).parent.parent.parent / "oobi" / "benchmark-definitions" / "python"))
import data_types, jax_model_definitions, unique_ids

from utils import execution_environment

def benchmark_lookup(unique_id: str):
if unique_id not in jax_model_definitions.JAX_MODELS_DICT:
raise ValueError(f"Id {unique_id} does not exist in model suite.")
id_list = '\n '.join(jax_model_definitions.JAX_MODELS_DICT.keys())
raise ValueError(f"Id {unique_id} does not exist in model suite. Expected "
f"one of:\n {id_list}")

model_definition = jax_model_definitions.JAX_MODELS_DICT[unique_id]
if unique_id.startswith(unique_ids.MODEL_RESNET50_FP32_JAX):
@@ -42,7 +40,11 @@ def dump_result(file_path: str, result: dict) -> None:
with open(file_path, "r") as f:
dictObj = json.load(f)

dictObj["execution_environment"] = {
"python_environment": execution_environment.get_python_environment_info()
}
dictObj["benchmarks"].append(result)

with open(file_path, "w") as f:
json.dump(dictObj, f)

@@ -75,7 +77,7 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,
end = time.perf_counter()
latency = 1000 * (end - start)
if i == 0:
compilation_time_s = latency / 1000
compile_time_s = latency / 1000
warmup_latencies.append(latency)

# Run benchmark.
@@ -88,35 +90,20 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,

# Save results.
result_dict = {
"min_warmup_latency_ms":
"n/a" if not warmup_latencies else str(min(warmup_latencies)),
"max_warmup_latency_ms":
"n/a" if not warmup_latencies else str(max(warmup_latencies)),
"mean_warmup_latency_ms":
"n/a" if not warmup_latencies else str(
statistics.mean(warmup_latencies)),
"median_warmup_latency_ms":
"n/a" if not warmup_latencies else str(
statistics.median(warmup_latencies)),
"stddev_warmup_latency_ms":
"n/a" if not warmup_latencies else str(
statistics.stdev(warmup_latencies)),
"warmup_iterations":
str(warmup_iterations),
"min_latency_ms":
str(min(latencies)),
"max_latency_ms":
str(max(latencies)),
"mean_latency_ms":
str(statistics.mean(latencies)),
"median_latency_ms":
str(statistics.median(latencies)),
"stddev_latency_ms":
str(statistics.stdev(latencies)),
"benchmark_iterations":
str(benchmark_iterations),
"compile_time_s": "n/a" if not warmup_latencies else str(compilation_time_s),
"input_data_transfer_ms": str(input_data_transfer_ms),
"min_warmup_latency_ms": min(warmup_latencies, default=None),
"max_warmup_latency_ms": max(warmup_latencies, default=None),
"mean_warmup_latency_ms": None if not warmup_latencies else statistics.mean(warmup_latencies),
"median_warmup_latency_ms": None if not warmup_latencies else statistics.median(warmup_latencies),
"stddev_warmup_latency_ms": None if not warmup_latencies else statistics.stdev(warmup_latencies),
"warmup_iterations": warmup_iterations,
"min_latency_ms": min(latencies, default=None),
"max_latency_ms": max(latencies, default=None),
"mean_latency_ms": None if not latencies else statistics.mean(latencies),
"median_latency_ms": None if not latencies else statistics.median(latencies),
"stddev_latency_ms": None if not latencies else statistics.stdev(latencies),
"benchmark_iterations": benchmark_iterations,
"compile_time_s": compile_time_s,
"input_data_transfer_ms": input_data_transfer_ms,
}
shared_dict.update(result_dict)

@@ -155,7 +142,7 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,
help="The path to `run_hlo_module`.")
argParser.add_argument(
"--run_in_process",
action=argparse.BooleanOptionalAction,
action="store_true",
help=
"Whether to run the benchmark under the same process. Set this to true when profiling a single workload"
)
@@ -172,10 +159,14 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,
benchmark_definition = {
"benchmark_id": args.benchmark_id,
"benchmark_name": model_definition.name,
"batch_size": str(batch_size),
"framework": "jax",
"framework": str(model_definition.meta_model.framework_type),
"data_type": str(model_definition.meta_model.data_type),
"batch_size": batch_size,
"inputs": model_definition.inputs.tensor_dimensions,
"outputs": model_definition.outputs.tensor_dimensions,
"compiler": "xla",
"device": args.device,
"tags": model_definition.meta_model.tags + model_definition.tags,
}

framework_metrics = {}
@@ -203,5 +194,5 @@ def run_framework_benchmark(model_name: str, model_class: Any, batch_size: int,
"framework_level": framework_metrics,
}
}
print(result)
print(json.dumps(result, indent=2))
dump_result(args.output_path, result)
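The reworked result_dict above stores plain numbers, falling back to None when a latency list is empty, and leaves all formatting to json.dumps. A small self-contained sketch of the same pattern (function and variable names here are illustrative, not taken from the script):

import json
import statistics
from typing import Optional, Sequence

def summarize_latencies(latencies: Sequence[float]) -> dict:
  # Values become None (rather than the old "n/a" strings) when there are no samples.
  def safe(fn) -> Optional[float]:
    return fn(latencies) if latencies else None

  return {
      "min_latency_ms": min(latencies, default=None),
      "max_latency_ms": max(latencies, default=None),
      "mean_latency_ms": safe(statistics.mean),
      "median_latency_ms": safe(statistics.median),
      "stddev_latency_ms": safe(statistics.stdev),
  }

print(json.dumps(summarize_latencies([12.3, 11.8, 12.1]), indent=2))
print(json.dumps(summarize_latencies([]), indent=2))  # all fields null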
12 changes: 8 additions & 4 deletions iree-tf/benchmark/benchmark_all.sh
@@ -10,9 +10,9 @@ VENV_DIR="tf-benchmarks.venv"
VENV_DIR="${VENV_DIR}" TENSORFLOW_VERSION="${TENSORFLOW_VERSION}" ${TD}/setup_venv.sh
source ${VENV_DIR}/bin/activate

MODEL_RESNET50_FP32_TF="2e1bd635-eeb3-41fa-90a6-e1cfdfa9be0a"
MODEL_BERT_LARGE_FP32_TF="979ff492-f363-4320-875f-e1ef93521132"
MODEL_T5_LARGE_FP32_TF="723da674-f42e-4d14-991e-16ad86a0d81b"
MODEL_RESNET50_FP32_TF="aff75509-4420-40a8-844e-dbfc48494fe6-MODEL_RESNET50-fp32-TF-224x224x3xf32"
MODEL_BERT_LARGE_FP32_TF="47cb0d3a-5eb7-41c7-9d7c-97aae7023ecf-MODEL_BERT_LARGE-fp32-TF-384xi32"
MODEL_T5_LARGE_FP32_TF="173c7180-bad4-4b91-8423-4beeb13d2b0a-MODEL_T5_LARGE-fp32-TF-512xi32"

declare -a gpu_benchmark_ids=(
"${MODEL_RESNET50_FP32_TF}-batch1"
@@ -59,6 +59,10 @@ else
ITERATIONS=20
fi

# Compiler-level benchmarks compile and run an inference per iteration.
# We keep this number low to reduce total benchmark time.
HLO_ITERATIONS=3

# Create json file and populate with global information.
rm "${OUTPUT_PATH}"
echo "{\"trigger\": { \"timestamp\": \"$(date +'%s')\" }, \"benchmarks\": []}" > "${OUTPUT_PATH}"
@@ -69,7 +73,7 @@ for benchmark_id in "${BENCHMARK_IDS[@]}"; do
--device="${DEVICE}"
--output_path="${OUTPUT_PATH}"
--iterations="${ITERATIONS}"
--hlo_iterations="${ITERATIONS}"
--hlo_iterations="${HLO_ITERATIONS}"
)

if [ -z "${TF_RUN_HLO_MODULE_PATH}" ]; then