From 3015699966e7113af89486934ab37b9c28102869 Mon Sep 17 00:00:00 2001 From: Michael Hudgins Date: Wed, 15 May 2024 13:46:40 -0400 Subject: [PATCH 1/4] Add core count to tpu nightly fix v5 job The current job assumes a 4 core TPU. Modify the matrix to enable defining the core count for each tpu --- .github/workflows/cloud-tpu-ci-nightly.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cloud-tpu-ci-nightly.yml b/.github/workflows/cloud-tpu-ci-nightly.yml index b4356ba64d7c..50a5482912cb 100644 --- a/.github/workflows/cloud-tpu-ci-nightly.yml +++ b/.github/workflows/cloud-tpu-ci-nightly.yml @@ -25,12 +25,16 @@ jobs: fail-fast: false # don't cancel all jobs on failure matrix: jaxlib-version: ["pypi_latest", "nightly", "nightly+oldest_supported_libtpu"] - tpu-type: ["v3-8", "v4-8", "v5e-4"] - name: "TPU test (jaxlib=${{ matrix.jaxlib-version }}, ${{ matrix.tpu-type }})" + tpu: [ + {type: "v3-8", core: "4"}, + {type: "v4-8", core: "4"}, + {type: "v5e-8", core: "8"} + ] + name: "TPU test (jaxlib=${{ matrix.jaxlib-version }}, ${{ matrix.tpu.type }})" env: LIBTPU_OLDEST_VERSION_DATE: 20240228 ENABLE_PJRT_COMPATIBILITY: ${{ matrix.jaxlib-version == 'nightly+oldest_supported_libtpu' }} - runs-on: ["self-hosted", "tpu", "${{ matrix.tpu-type }}"] + runs-on: ["self-hosted", "tpu", "${{ matrix.tpu.type }}"] timeout-minutes: 120 defaults: run: @@ -84,7 +88,7 @@ jobs: PY_COLORS: 1 run: | # Run single-accelerator tests in parallel - JAX_ENABLE_TPU_XDIST=true python3 -m pytest -n=4 --tb=short \ + JAX_ENABLE_TPU_XDIST=true python3 -m pytest -n={{ matrix.tpu.core }} --tb=short \ --maxfail=20 -m "not multiaccelerator" tests examples # Run multi-accelerator across all chips python3 -m pytest --tb=short --maxfail=20 -m "multiaccelerator" tests @@ -95,5 +99,5 @@ jobs: curl --location --request POST '${{ secrets.BUILD_CHAT_WEBHOOK }}' \ --header 'Content-Type: application/json' \ --data-raw "{ - 'text': '\"$GITHUB_WORKFLOW\", jaxlib/libtpu version \"${{ matrix.jaxlib-version }}\", TPU type ${{ matrix.tpu-type }} job failed, timed out, or was cancelled: $GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID' + 'text': '\"$GITHUB_WORKFLOW\", jaxlib/libtpu version \"${{ matrix.jaxlib-version }}\", TPU type ${{ matrix.tpu.type }} job failed, timed out, or was cancelled: $GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID' }" From 181da1280985c2d63365f175b60b51a040964636 Mon Sep 17 00:00:00 2001 From: Michael Hudgins Date: Wed, 15 May 2024 14:33:01 -0400 Subject: [PATCH 2/4] Add missing bash symbol --- .github/workflows/cloud-tpu-ci-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cloud-tpu-ci-nightly.yml b/.github/workflows/cloud-tpu-ci-nightly.yml index 50a5482912cb..9e4a0bd37c20 100644 --- a/.github/workflows/cloud-tpu-ci-nightly.yml +++ b/.github/workflows/cloud-tpu-ci-nightly.yml @@ -88,7 +88,7 @@ jobs: PY_COLORS: 1 run: | # Run single-accelerator tests in parallel - JAX_ENABLE_TPU_XDIST=true python3 -m pytest -n={{ matrix.tpu.core }} --tb=short \ + JAX_ENABLE_TPU_XDIST=true python3 -m pytest -n=${{ matrix.tpu.core }} --tb=short \ --maxfail=20 -m "not multiaccelerator" tests examples # Run multi-accelerator across all chips python3 -m pytest --tb=short --maxfail=20 -m "multiaccelerator" tests From 0232cb9f8d365cc9e4445bf26efae6fedf9f3cae Mon Sep 17 00:00:00 2001 From: Michael Hudgins Date: Wed, 15 May 2024 14:37:27 -0400 Subject: [PATCH 3/4] Run v3-8 tests with cores set at 8 --- .github/workflows/cloud-tpu-ci-nightly.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cloud-tpu-ci-nightly.yml b/.github/workflows/cloud-tpu-ci-nightly.yml index 9e4a0bd37c20..bc36daa6c9be 100644 --- a/.github/workflows/cloud-tpu-ci-nightly.yml +++ b/.github/workflows/cloud-tpu-ci-nightly.yml @@ -26,9 +26,9 @@ jobs: matrix: jaxlib-version: ["pypi_latest", "nightly", "nightly+oldest_supported_libtpu"] tpu: [ - {type: "v3-8", core: "4"}, - {type: "v4-8", core: "4"}, - {type: "v5e-8", core: "8"} + {type: "v3-8", cores: "8"}, + {type: "v4-8", cores: "4"}, + {type: "v5e-8", cores: "8"} ] name: "TPU test (jaxlib=${{ matrix.jaxlib-version }}, ${{ matrix.tpu.type }})" env: @@ -88,7 +88,7 @@ jobs: PY_COLORS: 1 run: | # Run single-accelerator tests in parallel - JAX_ENABLE_TPU_XDIST=true python3 -m pytest -n=${{ matrix.tpu.core }} --tb=short \ + JAX_ENABLE_TPU_XDIST=true python3 -m pytest -n=${{ matrix.tpu.cores }} --tb=short \ --maxfail=20 -m "not multiaccelerator" tests examples # Run multi-accelerator across all chips python3 -m pytest --tb=short --maxfail=20 -m "multiaccelerator" tests From 76a7b19c1019c42ad8b1c22387b7202910e68d4f Mon Sep 17 00:00:00 2001 From: Michael Hudgins Date: Wed, 15 May 2024 14:40:57 -0400 Subject: [PATCH 4/4] Set V3 cores to 4 --- .github/workflows/cloud-tpu-ci-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cloud-tpu-ci-nightly.yml b/.github/workflows/cloud-tpu-ci-nightly.yml index bc36daa6c9be..c7d3afd4c4a9 100644 --- a/.github/workflows/cloud-tpu-ci-nightly.yml +++ b/.github/workflows/cloud-tpu-ci-nightly.yml @@ -26,7 +26,7 @@ jobs: matrix: jaxlib-version: ["pypi_latest", "nightly", "nightly+oldest_supported_libtpu"] tpu: [ - {type: "v3-8", cores: "8"}, + {type: "v3-8", cores: "4"}, {type: "v4-8", cores: "4"}, {type: "v5e-8", cores: "8"} ]