Skip to content

Commit

Permalink
Added rep_removal
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <[email protected]>
  • Loading branch information
touma-I committed Jan 31, 2025
2 parents ec46288 + f9a3cc4 commit 813c7ca
Show file tree
Hide file tree
Showing 32 changed files with 104,498 additions and 3 deletions.
133 changes: 133 additions & 0 deletions .github/workflows/test-universal-rep_removal.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
#
name: Test - transforms/universal/rep_removal

# Triggers: manual dispatch, pushes to dev/release branches or any tag, and
# pull requests that target dev/release branches. Both push and pull_request
# share the same path list: positive patterns first, then "!" exclusions.
on:
  workflow_dispatch:
  push:
    branches: ["dev", "releases/**"]
    tags: ["*"]
    paths:
      - ".make.*"
      - "transforms/.make.transforms"
      - "transforms/universal/rep_removal/**"
      - "data-processing-lib/**"
      - "!transforms/universal/rep_removal/**/kfp_ray/**" # This is/will be tested in separate workflow
      - "!data-processing-lib/**/test/**"
      - "!data-processing-lib/**/test-data/**"
      - "!**.md"
      - "!**/doc/**"
      - "!**/images/**"
      - "!**.gitignore"
  pull_request:
    branches: ["dev", "releases/**"]
    paths:
      - ".make.*"
      - "transforms/.make.transforms"
      - "transforms/universal/rep_removal/**"
      - "data-processing-lib/**"
      - "!transforms/universal/rep_removal/**/kfp_ray/**" # This is/will be tested in separate workflow
      - "!data-processing-lib/**/test/**"
      - "!data-processing-lib/**/test-data/**"
      - "!**.md"
      - "!**/doc/**"
      - "!**/images/**"
      - "!**.gitignore"

# Cancel a still-running earlier run of this workflow for the same PR (or ref).
# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  check_if_push_image:
    # check whether the Docker images should be pushed to the remote repository
    # The images are pushed if it is a merge to dev branch or a new tag is created.
    # The latter being part of the release process.
    # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
    runs-on: ubuntu-22.04
    outputs:
      # 'true' or 'false' (string); consumed by the "Publish images" step of test-image.
      publish_images: ${{ steps.version.outputs.publish_images }}
    steps:
      - id: version
        run: |
          publish_images='false'
          if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
          then
              publish_images='true'
          fi
          if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
          then
              publish_images='true'
          fi
          echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"
  test-src:
    # Runs the transform's source-level tests (make test-src) when a Makefile exists.
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free up space in github runner
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        run: |
          df -h
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
          sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
          df -h
      - name: Test transform source in transforms/universal/rep_removal
        run: |
          if [ -e "transforms/universal/rep_removal/Makefile" ]; then
            make -C transforms/universal/rep_removal DOCKER=docker test-src
          else
            echo "transforms/universal/rep_removal/Makefile not found - source testing disabled for this transform."
          fi
  test-image:
    # Builds and tests the transform image, then publishes it when
    # check_if_push_image reported publish_images == 'true'.
    needs: [check_if_push_image]
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    env:
      DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
      DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free up space in github runner
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        # (/opt/ghc is removed once, as part of the combined rm below.)
        run: |
          df -h
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
          sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
          df -h
      - name: Test transform image in transforms/universal/rep_removal
        run: |
          if [ -e "transforms/universal/rep_removal/Makefile" ]; then
            # A spark/ sub-directory means the spark library image must be built first.
            if [ -d "transforms/universal/rep_removal/spark" ]; then
              make -C data-processing-lib/spark DOCKER=docker image
            fi
            make -C transforms/universal/rep_removal DOCKER=docker test-image
          else
            echo "transforms/universal/rep_removal/Makefile not found - testing disabled for this transform."
          fi
      - name: Print space
        # Report remaining disk space and the images left on the runner after the image tests.
        run: |
          df -h
          docker images
      - name: Publish images
        if: needs.check_if_push_image.outputs.publish_images == 'true'
        run: |
          if [ -e "transforms/universal/rep_removal/Makefile" ]; then
            make -C transforms/universal/rep_removal publish
          else
            echo "transforms/universal/rep_removal/Makefile not found - publishing disabled for this transform."
          fi
11 changes: 8 additions & 3 deletions transforms/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"



[tool.setuptools.dynamic.dependencies]
file = ["requirements.txt"]

Expand All @@ -28,7 +29,6 @@ all = { file = [
"code/license_select/python/requirements.txt",
"code/code_quality/python/requirements.txt",
"code/code2parquet/python/requirements.txt",

"code/code_profiler/requirements.txt",

"language/pii_redactor/requirements.txt",
Expand Down Expand Up @@ -57,7 +57,8 @@ all = { file = [
"universal/fdedup/requirements.txt",
"universal/hap/requirements.txt",
"universal/tokenization/requirements.txt",
"universal/web2parquet/requirements.txt"
"universal/web2parquet/requirements.txt",
"universal/rep_removal/requirements.txt"
]}

language = { file = [
Expand Down Expand Up @@ -127,6 +128,7 @@ code_profiler = { file = ["code/code_profiler/requirements.txt"]}

gneissweb_classification = { file = ["language/gneissweb_classification/requirements.txt"]}

rep_removal = { file = ["universal/rep_removal/requirements.txt"]}
# Does not seem to work for our custom layout
# copy all files to a single src and let automatic discovery find them

Expand Down Expand Up @@ -155,10 +157,13 @@ dpk_readability = "language/readability/dpk_readability"
dpk_profiler = "universal/profiler/dpk_profiler"
dpk_resize = "universal/resize/dpk_resize"
dpk_gneissweb_classification = "language/gneissweb_classification/dpk_gneissweb_classification"
dpk_rep_removal = "universal/rep_removal/dpk_rep_removal"


#[tool.setuptools.package-data]
[tool.setuptools.package-data]
#"*" = ["*.txt"]
"dpk_rep_removal.rust" = ["**"]
"dpk_rep_removal.gpt2" = ["**"]

[options]
package_dir = ["src","test"]
Expand Down
39 changes: 39 additions & 0 deletions transforms/universal/rep_removal/Dockerfile.python
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Image for running the transform with the pure-python runtime.
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip
# curl is needed to download the rustup installer below; gcc is installed with it
# (presumably as the C toolchain/linker for Rust builds - confirm).
# apt-get is used instead of apt (apt's CLI is not stable for scripted use), both
# packages go in one transaction, and the package lists are dropped in the same
# layer so they do not add to the image size.
RUN apt-get update \
    && apt-get install -y curl gcc \
    && rm -rf /var/lib/apt/lists/*

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk
ENV HOME="/home/dpk"
# Build arguments: wheel file name of the data processing library and the
# transform name (used for the dpk_<name> package directory).
ARG DPK_WHEEL_FILE_NAME
ARG TRANSFORM_NAME

# install rust and set path
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="$PATH:$HOME/.cargo/bin"

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:users data-processing-dist data-processing-dist
# --no-cache-dir keeps pip's download cache out of the layer, matching the other pip calls.
RUN pip install --no-cache-dir data-processing-dist/${DPK_WHEEL_FILE_NAME}

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:users dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
COPY --chown=dpk:users requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Set environment
ENV PYTHONPATH="/home/dpk"

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
40 changes: 40 additions & 0 deletions transforms/universal/rep_removal/Dockerfile.ray
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Image for running the transform with the Ray runtime.
ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
FROM ${BASE_IMAGE}

# see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
USER root
RUN chown ray:root /home/ray && chmod 775 /home/ray

RUN pip install --upgrade --no-cache-dir pip
# curl is needed to download the rustup installer below; gcc is installed with it
# (presumably as the C toolchain/linker for Rust builds - confirm).
# apt-get is used instead of apt (apt's CLI is not stable for scripted use), both
# packages go in one transaction, and the package lists are dropped in the same
# layer so they do not add to the image size.
RUN apt-get update \
    && apt-get install -y curl gcc \
    && rm -rf /var/lib/apt/lists/*

USER ray

# install pytest
RUN pip install --no-cache-dir pytest
# Build arguments: wheel file name of the data processing library and the
# transform name (used for the dpk_<name> package directory).
ARG DPK_WHEEL_FILE_NAME
ARG TRANSFORM_NAME

ENV HOME="/home/ray"
# install rust and set path
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="$PATH:$HOME/.cargo/bin"

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
# The requirement is quoted so the shell never treats "[ray]" as a glob bracket
# expression; --no-cache-dir matches the other pip calls in this file.
RUN pip install --no-cache-dir "data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]"


COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Set environment
ENV PYTHONPATH="/home/ray"

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
24 changes: 24 additions & 0 deletions transforms/universal/rep_removal/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Path from this transform's directory up to the repository root.
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform and is used to match against
# expected files and is used to define the transform's image name.
# Simply-expanded (:=) so the shell is forked once at parse time instead of on
# every reference to TRANSFORM_NAME; the value (the current directory's base
# name) cannot change during a single make run.
TRANSFORM_NAME:=$(shell basename `pwd`)

################################################################################
# Interpreter arguments that select the python / ray runtime module of this
# transform. The double quotes are part of the value.
TRANSFORM_PYTHON_SRC="-m dpk_$(TRANSFORM_NAME).runtime"
TRANSFORM_RAY_SRC="-m dpk_$(TRANSFORM_NAME).ray.runtime"

# Run the python runtime of this transform against test-data/input, writing to
# ./output, inside the virtual environment built by the `venv` target.
# `venv` and $(PYTHON) are not defined in this file - presumably they come from
# the included .make.cicd.targets; confirm.
.PHONY: run-cli-sample
run-cli-sample:
	$(MAKE) venv
	. venv/bin/activate && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).runtime \
	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
	--rep_removal_contents_column_name 'text' \
	--rep_removal_num_threads '1'
Loading

0 comments on commit 813c7ca

Please sign in to comment.