Skip to content

Commit c21a253

Browse files
authored
Merge pull request #965 from IBM/extreme_readability
Add extreme tokenize and readability transforms
2 parents df61f49 + 7d7acf9 commit c21a253

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+64450
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
#
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
#
name: Test - transforms/language/extreme_tokenized

# Triggers: manual dispatch, pushes to dev / releases/** (and any tag), and
# pull requests targeting dev / releases/**. The path filters restrict runs to
# changes in the shared make files, this transform, or data-processing-lib,
# and exclude docs, images, test data and the kfp_ray pipeline sources.
on:
  workflow_dispatch:
  push:
    branches:
      - "dev"
      - "releases/**"
    tags:
      - "*"
    paths:
      - ".make.*"
      - "transforms/.make.transforms"
      - "transforms/language/extreme_tokenized/**"
      - "data-processing-lib/**"
      - "!transforms/language/extreme_tokenized/**/kfp_ray/**"  # This is/will be tested in separate workflow
      - "!data-processing-lib/**/test/**"
      - "!data-processing-lib/**/test-data/**"
      - "!**.md"
      - "!**/doc/**"
      - "!**/images/**"
      - "!**.gitignore"
  pull_request:
    branches:
      - "dev"
      - "releases/**"
    paths:
      - ".make.*"
      - "transforms/.make.transforms"
      - "transforms/language/extreme_tokenized/**"
      - "data-processing-lib/**"
      - "!transforms/language/extreme_tokenized/**/kfp_ray/**"  # This is/will be tested in separate workflow
      - "!data-processing-lib/**/test/**"
      - "!data-processing-lib/**/test-data/**"
      - "!**.md"
      - "!**/doc/**"
      - "!**/images/**"
      - "!**.gitignore"

# Cancel a still-running workflow for the same PR (or the same ref when there is no PR)
# as soon as a newer run starts.
# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  check_if_push_image:
    # check whether the Docker images should be pushed to the remote repository
    # The images are pushed if it is a merge to dev branch or a new tag is created.
    # The latter being part of the release process.
    # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
    runs-on: ubuntu-22.04
    outputs:
      # 'true' or 'false' (string), consumed by the "Publish images" step of test-image.
      publish_images: ${{ steps.version.outputs.publish_images }}
    steps:
      - id: version
        run: |
          publish_images='false'
          # Non-PR event on the dev branch of the upstream repository.
          if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]]; then
            publish_images='true'
          fi
          # Any tag pushed to the upstream repository.
          if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]]; then
            publish_images='true'
          fi
          echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"

  test-src:
    # Runs the transform's source-level tests through its Makefile.
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free up space in github runner
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        run: |
          df -h
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf \
            /usr/share/dotnet /opt/ghc /usr/local/lib/android \
            /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
          # Best effort: ignore the failure when there are no images to remove.
          sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
          df -h
      - name: Test transform source in transforms/language/extreme_tokenized
        run: |
          if [ -e "transforms/language/extreme_tokenized/Makefile" ]; then
            make -C transforms/language/extreme_tokenized DOCKER=docker test-src
          else
            echo "transforms/language/extreme_tokenized/Makefile not found - source testing disabled for this transform."
          fi

  test-image:
    # Builds and tests the transform image(s), then publishes them when
    # check_if_push_image reported publish_images == 'true'.
    needs: [check_if_push_image]
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    # NOTE(review): these secrets are exposed to every step of this job, not only
    # "Publish images" - confirm the test targets need them before narrowing the scope.
    env:
      DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
      DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free up space in github runner
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        run: |
          df -h
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf \
            /usr/share/dotnet /opt/ghc /usr/local/lib/android \
            /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
          # Best effort: ignore the failure when there are no images to remove.
          sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
          df -h
      - name: Test transform image in transforms/language/extreme_tokenized
        run: |
          if [ -e "transforms/language/extreme_tokenized/Makefile" ]; then
            # A spark variant of the transform needs the data-processing-lib/spark image built first.
            if [ -d "transforms/language/extreme_tokenized/spark" ]; then
              make -C data-processing-lib/spark DOCKER=docker image
            fi
            make -C transforms/language/extreme_tokenized DOCKER=docker test-image
          else
            echo "transforms/language/extreme_tokenized/Makefile not found - testing disabled for this transform."
          fi
      - name: Print space
        # Report the remaining disk space and the local Docker images after the image tests.
        run: |
          df -h
          docker images
      - name: Publish images
        if: needs.check_if_push_image.outputs.publish_images == 'true'
        run: |
          if [ -e "transforms/language/extreme_tokenized/Makefile" ]; then
            make -C transforms/language/extreme_tokenized publish
          else
            echo "transforms/language/extreme_tokenized/Makefile not found - publishing disabled for this transform."
          fi
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
#
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
#
name: Test - transforms/language/readability

# Triggers: manual dispatch, pushes to dev / releases/** (and any tag), and
# pull requests targeting dev / releases/**. The path filters restrict runs to
# changes in the shared make files, this transform, or data-processing-lib,
# and exclude docs, images, test data and the kfp_ray pipeline sources.
on:
  workflow_dispatch:
  push:
    branches:
      - "dev"
      - "releases/**"
    tags:
      - "*"
    paths:
      - ".make.*"
      - "transforms/.make.transforms"
      - "transforms/language/readability/**"
      - "data-processing-lib/**"
      - "!transforms/language/readability/**/kfp_ray/**"  # This is/will be tested in separate workflow
      - "!data-processing-lib/**/test/**"
      - "!data-processing-lib/**/test-data/**"
      - "!**.md"
      - "!**/doc/**"
      - "!**/images/**"
      - "!**.gitignore"
  pull_request:
    branches:
      - "dev"
      - "releases/**"
    paths:
      - ".make.*"
      - "transforms/.make.transforms"
      - "transforms/language/readability/**"
      - "data-processing-lib/**"
      - "!transforms/language/readability/**/kfp_ray/**"  # This is/will be tested in separate workflow
      - "!data-processing-lib/**/test/**"
      - "!data-processing-lib/**/test-data/**"
      - "!**.md"
      - "!**/doc/**"
      - "!**/images/**"
      - "!**.gitignore"

# Cancel a still-running workflow for the same PR (or the same ref when there is no PR)
# as soon as a newer run starts.
# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  check_if_push_image:
    # check whether the Docker images should be pushed to the remote repository
    # The images are pushed if it is a merge to dev branch or a new tag is created.
    # The latter being part of the release process.
    # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
    runs-on: ubuntu-22.04
    outputs:
      # 'true' or 'false' (string), consumed by the "Publish images" step of test-image.
      publish_images: ${{ steps.version.outputs.publish_images }}
    steps:
      - id: version
        run: |
          publish_images='false'
          # Non-PR event on the dev branch of the upstream repository.
          if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]]; then
            publish_images='true'
          fi
          # Any tag pushed to the upstream repository.
          if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]]; then
            publish_images='true'
          fi
          echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"

  test-src:
    # Runs the transform's source-level tests through its Makefile.
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free up space in github runner
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        run: |
          df -h
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf \
            /usr/share/dotnet /opt/ghc /usr/local/lib/android \
            /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
          # Best effort: ignore the failure when there are no images to remove.
          sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
          df -h
      - name: Test transform source in transforms/language/readability
        run: |
          if [ -e "transforms/language/readability/Makefile" ]; then
            make -C transforms/language/readability DOCKER=docker test-src
          else
            echo "transforms/language/readability/Makefile not found - source testing disabled for this transform."
          fi

  test-image:
    # Builds and tests the transform image(s), then publishes them when
    # check_if_push_image reported publish_images == 'true'.
    needs: [check_if_push_image]
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    # NOTE(review): these secrets are exposed to every step of this job, not only
    # "Publish images" - confirm the test targets need them before narrowing the scope.
    env:
      DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
      DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free up space in github runner
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        run: |
          df -h
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf \
            /usr/share/dotnet /opt/ghc /usr/local/lib/android \
            /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
          # Best effort: ignore the failure when there are no images to remove.
          sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
          df -h
      - name: Test transform image in transforms/language/readability
        run: |
          if [ -e "transforms/language/readability/Makefile" ]; then
            # A spark variant of the transform needs the data-processing-lib/spark image built first.
            if [ -d "transforms/language/readability/spark" ]; then
              make -C data-processing-lib/spark DOCKER=docker image
            fi
            make -C transforms/language/readability DOCKER=docker test-image
          else
            echo "transforms/language/readability/Makefile not found - testing disabled for this transform."
          fi
      - name: Print space
        # Report the remaining disk space and the local Docker images after the image tests.
        run: |
          df -h
          docker images
      - name: Publish images
        if: needs.check_if_push_image.outputs.publish_images == 'true'
        run: |
          if [ -e "transforms/language/readability/Makefile" ]; then
            make -C transforms/language/readability publish
          else
            echo "transforms/language/readability/Makefile not found - publishing disabled for this transform."
          fi
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
# Image for the pure-Python runtime of a data-prep-kit transform.
# Build args:
#   DPK_WHEEL_FILE_NAME  file name of the data-processing wheel inside data-processing-dist/
#   TRANSFORM_NAME       transform name; its sources are copied from dpk_<TRANSFORM_NAME>/
#   BUILD_DATE, GIT_COMMIT  recorded as image labels
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk
ARG DPK_WHEEL_FILE_NAME
ARG TRANSFORM_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:users data-processing-dist data-processing-dist
# --no-cache-dir keeps pip's download cache out of the layer, like the other pip calls in this file.
RUN pip install --no-cache-dir "data-processing-dist/${DPK_WHEEL_FILE_NAME}"

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
COPY --chown=dpk:users requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Set environment (key=value form; the whitespace-separated ENV form is legacy syntax)
ENV PYTHONPATH=/home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
# Quoted so that a value containing spaces stays a single label value.
LABEL build-date="$BUILD_DATE"
LABEL git-commit="$GIT_COMMIT"
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
# Image for the Ray runtime of a data-prep-kit transform.
# Build args:
#   BASE_IMAGE           Ray base image (defaults to rayproject/ray:2.24.0-py310)
#   DPK_WHEEL_FILE_NAME  file name of the data-processing wheel inside data-processing-dist/
#   TRANSFORM_NAME       transform name; its sources are copied from dpk_<TRANSFORM_NAME>/
#   BUILD_DATE, GIT_COMMIT  recorded as image labels
ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
FROM ${BASE_IMAGE}

# see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
USER root
RUN chown ray:root /home/ray && chmod 775 /home/ray
USER ray

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest
ARG DPK_WHEEL_FILE_NAME
ARG TRANSFORM_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
# Quoted so the shell cannot treat "[ray]" as a glob character class;
# --no-cache-dir matches the other pip calls in this file.
RUN pip install --no-cache-dir "data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]"

COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Set environment (key=value form; the whitespace-separated ENV form is legacy syntax)
ENV PYTHONPATH=/home/ray

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
# Quoted so that a value containing spaces stays a single label value.
LABEL build-date="$BUILD_DATE"
LABEL git-commit="$GIT_COMMIT"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
# Path from this transform directory back to the repository root.
REPOROOT = ../../..

# Shared CI/CD rules for transforms; run `make help` to list the available rules.
include $(REPOROOT)/transforms/.make.cicd.targets

# Module-style entry points for each runtime, used until runtime.py becomes the
# standard supported by the Makefile infrastructure. The double quotes are part
# of each value. TRANSFORM_NAME is defined further down; that works because
# these are recursively expanded (=) variables, resolved only when used.
TRANSFORM_PYTHON_SRC = "-m dpk_$(TRANSFORM_NAME).runtime"
TRANSFORM_RAY_SRC = "-m dpk_$(TRANSFORM_NAME).ray.runtime"
TRANSFORM_SPARK_SRC = "-m dpk_$(TRANSFORM_NAME).spark.runtime"

################################################################################
# The section below is meant to be shared by the Makefiles found anywhere in a
# transform's directory tree, so it must stick to syntax compatible with all of
# them.
#
# TRANSFORM_NAME is the name of the current directory. It is matched against
# expected files and is used to define the transform's image name.
TRANSFORM_NAME = $(shell basename $$(pwd))

################################################################################

0 commit comments

Comments
 (0)