.github/workflows/maven_test.yml

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: Build and test using Maven

on:
  workflow_call:
    inputs:
      java:
        required: false
        type: string
        default: 17
      branch:
        description: Branch to run the build against
        required: false
        type: string
        default: master
      hadoop:
        description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
        required: false
        type: string
        default: hadoop3
      envs:
        description: Additional environment variables to set when running the tests. Should be in JSON format.
        required: false
        type: string
        default: '{}'
jobs:
  # Build: build Spark and run the tests for specified modules using maven
  build:
    name: "Build modules using Maven: ${{ matrix.modules }} ${{ matrix.comment }}"
    runs-on: ubuntu-22.04
    strategy:
      fail-fast: false
      matrix:
        java:
          - ${{ inputs.java }}
        hadoop:
          - ${{ inputs.hadoop }}
        hive:
          - hive2.3
        modules:
          - >-
            core,launcher,common#unsafe,common#kvstore,common#network-common,common#network-shuffle,common#sketch,common#utils
          - >-
            graphx,streaming,hadoop-cloud
          - >-
            mllib-local,mllib
          - >-
            repl,sql#hive-thriftserver
          - >-
            connector#kafka-0-10,connector#kafka-0-10-sql,connector#kafka-0-10-token-provider,connector#spark-ganglia-lgpl,connector#protobuf,connector#avro
          - >-
            sql#api,sql#catalyst,resource-managers#yarn,resource-managers#kubernetes#core
          - >-
            connect
        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
        included-tags: [ "" ]
        excluded-tags: [ "" ]
        comment: [ "" ]
        include:
          # Hive tests
          - modules: sql#hive
            java: ${{ inputs.java }}
            hadoop: ${{ inputs.hadoop }}
            hive: hive2.3
            included-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- slow tests"
          - modules: sql#hive
            java: ${{ inputs.java }}
            hadoop: ${{ inputs.hadoop }}
            hive: hive2.3
            excluded-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- other tests"
          # SQL tests
          - modules: sql#core
            java: ${{ inputs.java }}
            hadoop: ${{ inputs.hadoop }}
            hive: hive2.3
            included-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- extended tests"
          - modules: sql#core
            java: ${{ inputs.java }}
            hadoop: ${{ inputs.hadoop }}
            hive: hive2.3
            included-tags: org.apache.spark.tags.SlowSQLTest
            comment: "- slow tests"
          - modules: sql#core
            java: ${{ inputs.java }}
            hadoop: ${{ inputs.hadoop }}
            hive: hive2.3
            excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest
            comment: "- other tests"
    env:
      MODULES_TO_TEST: ${{ matrix.modules }}
      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
      INCLUDED_TAGS: ${{ matrix.included-tags }}
      HADOOP_PROFILE: ${{ matrix.hadoop }}
      HIVE_PROFILE: ${{ matrix.hive }}
      SPARK_LOCAL_IP: localhost
      GITHUB_PREV_SHA: ${{ github.event.before }}
    steps:
      - name: Checkout Spark repository
        uses: actions/checkout@v4
        # In order to fetch changed files
        with:
          fetch-depth: 0
          repository: apache/spark
          ref: ${{ inputs.branch }}
      - name: Sync the current branch with the latest in Apache Spark
        if: github.repository != 'apache/spark'
        run: |
          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
      # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
      - name: Cache Scala, SBT and Maven
        uses: actions/cache@v3
        with:
          path: |
            build/apache-maven-*
            build/scala-*
            build/*.jar
            ~/.sbt
          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
          restore-keys: |
            build-
      - name: Cache Maven local repository
        uses: actions/cache@v3
        with:
          path: ~/.m2/repository
          key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
          restore-keys: |
            java${{ matrix.java }}-maven-
      - name: Install Java ${{ matrix.java }}
        uses: actions/setup-java@v4
        with:
          distribution: zulu
          java-version: ${{ matrix.java }}
      - name: Install Python 3.9
        uses: actions/setup-python@v5
        # We should install one Python that is higher than 3+ for SQL and Yarn because:
        # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
        # - Yarn has a Python specific test too, for example, YarnClusterSuite.
        if: contains(matrix.modules, 'resource-managers#yarn') || (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect')
        with:
          python-version: '3.9'
          architecture: x64
      - name: Install Python packages (Python 3.9)
        if: (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect')
        run: |
          python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.59.3' 'grpcio-status==1.59.3' 'protobuf==4.25.1'
          python3.9 -m pip list
      # Run the tests.
      - name: Run tests
        env: ${{ fromJSON(inputs.envs) }}
        shell: 'script -q -e -c "bash {0}"'
        run: |
          # Fix for TTY related issues when launching the Ammonite REPL in tests.
          export TERM=vt100
          # `set -e` to make the exit status as expected due to use `script -q -e -c` to run the commands
          set -e
          export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
          export MAVEN_CLI_OPTS="--no-transfer-progress"
          export JAVA_VERSION=${{ matrix.java }}
          # Replace with the real module name, for example, connector#kafka-0-10 -> connector/kafka-0-10
          export TEST_MODULES=`echo "$MODULES_TO_TEST" | sed -e "s%#%/%g"`
          ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Djava.version=${JAVA_VERSION/-ea} clean install
          if [[ "$INCLUDED_TAGS" != "" ]]; then
            ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae
          elif [[ "$EXCLUDED_TAGS" != "" ]]; then
            ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Djava.version=${JAVA_VERSION/-ea} -Dtest.exclude.tags="$EXCLUDED_TAGS" test -fae
          elif [[ "$MODULES_TO_TEST" == "connect" ]]; then
            ./build/mvn $MAVEN_CLI_OPTS -Djava.version=${JAVA_VERSION/-ea} -pl connector/connect/client/jvm,connector/connect/common,connector/connect/server test -fae
          elif [[ "$MODULES_TO_TEST" == *"sql#hive-thriftserver"* ]]; then
            # To avoid a compilation loop, for the `sql/hive-thriftserver` module, run `clean install` instead
            ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Djava.version=${JAVA_VERSION/-ea} clean install -fae
          else
            ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Pspark-ganglia-lgpl -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} test -fae
          fi
      - name: Clean up local Maven repository
        run: |  
          rm -rf ~/.m2/repository/org/apache/spark
      - name: Upload test results to report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
          path: "**/target/test-reports/*.xml"
      - name: Upload unit tests log files
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
          path: "**/target/unit-tests.log"