Tokenization2Arrow - New Transform to tokenize data and generate .arrow and metadata files #1033
Merged
Changes from all commits (13 commits):
- `b819e5b` First version of Tokenization2Arrow Transform (santoshborse)
- `522abf8` enable workflow (touma-I)
- `a93bcde` Adding Docker file (santoshborse)
- `bc04e5d` Adding 2 tests (santoshborse)
- `1b4c6b6` Adding python test (santoshborse)
- `7fee472` Added tokenization2arrow module (touma-I)
- `6f9d911` Adding example notebook (santoshborse)
- `2cc4025` Remvoving explicit install of data-prep-toolkit (santoshborse)
- `6ef47a9` Adding Ray based notebook (santoshborse)
- `d201335` change the Ray module name to match with project convention (santoshborse)
- `3863b78` Fixing notebook and makefiles to have correct main class name (santoshborse)
- `f4c54fc` Fixing makefile (santoshborse)
- `4397dfe` Some changes in README (shahrokhDaijavad)
New file (133 additions, 0 deletions): .github/workflows/test-universal-tokenization2arrow.yml
```yaml
#
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
#
name: Test - transforms/universal/tokenization2arrow

on:
  workflow_dispatch:
  push:
    branches:
      - "dev"
      - "releases/**"
    tags:
      - "*"
    paths:
      - ".make.*"
      - "transforms/.make.transforms"
      - "transforms/universal/tokenization2arrow/**"
      - "data-processing-lib/**"
      - "!transforms/universal/tokenization2arrow/**/kfp_ray/**" # This is/will be tested in separate workflow
      - "!data-processing-lib/**/test/**"
      - "!data-processing-lib/**/test-data/**"
      - "!**.md"
      - "!**/doc/**"
      - "!**/images/**"
      - "!**.gitignore"
  pull_request:
    branches:
      - "dev"
      - "releases/**"
    paths:
      - ".make.*"
      - "transforms/.make.transforms"
      - "transforms/universal/tokenization2arrow/**"
      - "data-processing-lib/**"
      - "!transforms/universal/tokenization2arrow/**/kfp_ray/**" # This is/will be tested in separate workflow
      - "!data-processing-lib/**/test/**"
      - "!data-processing-lib/**/test-data/**"
      - "!**.md"
      - "!**/doc/**"
      - "!**/images/**"
      - "!**.gitignore"

# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  check_if_push_image:
    # check whether the Docker images should be pushed to the remote repository
    # The images are pushed if it is a merge to dev branch or a new tag is created.
    # The latter being part of the release process.
    # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
    runs-on: ubuntu-22.04
    outputs:
      publish_images: ${{ steps.version.outputs.publish_images }}
    steps:
      - id: version
        run: |
          publish_images='false'
          if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
          then
            publish_images='true'
          fi
          if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
          then
            publish_images='true'
          fi
          echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"
  test-src:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free up space in github runner
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        run: |
          df -h
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
          sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
          df -h
      - name: Test transform source in transforms/universal/tokenization2arrow
        run: |
          if [ -e "transforms/universal/tokenization2arrow/Makefile" ]; then
            make -C transforms/universal/tokenization2arrow DOCKER=docker test-src
          else
            echo "transforms/universal/tokenization2arrow/Makefile not found - source testing disabled for this transform."
          fi
  test-image:
    needs: [check_if_push_image]
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    env:
      DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
      DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free up space in github runner
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        run: |
          df -h
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
          sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
          df -h
      - name: Test transform image in transforms/universal/tokenization2arrow
        run: |
          if [ -e "transforms/universal/tokenization2arrow/Makefile" ]; then
            if [ -d "transforms/universal/tokenization2arrow/spark" ]; then
              make -C data-processing-lib/spark DOCKER=docker image
            fi
            make -C transforms/universal/tokenization2arrow DOCKER=docker test-image
          else
            echo "transforms/universal/tokenization2arrow/Makefile not found - testing disabled for this transform."
          fi
      - name: Print space
        # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
        run: |
          df -h
          docker images
      - name: Publish images
        if: needs.check_if_push_image.outputs.publish_images == 'true'
        run: |
          if [ -e "transforms/universal/tokenization2arrow/Makefile" ]; then
            make -C transforms/universal/tokenization2arrow publish
          else
            echo "transforms/universal/tokenization2arrow/Makefile not found - publishing disabled for this transform."
          fi
```
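Because the workflow declares a `workflow_dispatch` trigger, it can also be started by hand. A minimal sketch using the GitHub CLI, assuming you have the necessary repository access (the file name is taken from the header above):

```shell
# Manually trigger the CI workflow for this transform on the dev branch
gh workflow run test-universal-tokenization2arrow.yml --ref dev
```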
New file (34 additions, 0 deletions): Dockerfile for the python runtime image
```dockerfile
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk
ARG DPK_WHEEL_FILE_NAME
ARG TRANSFORM_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root dpk_tokenization2arrow/ dpk_tokenization2arrow/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Set environment
ENV PYTHONPATH /home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
```
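For reference, a manual build of this image might look like the sketch below. This is an assumption about how the `make image` target wires things up (the wheel name, Dockerfile location, and image tag are placeholders, not values taken from the PR):

```shell
# Build the python runtime image; `make image` normally copies the wheel and sources into the context first.
docker build . \
  --build-arg DPK_WHEEL_FILE_NAME=<name of the wheel placed under data-processing-dist/> \
  --build-arg BUILD_DATE="$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  --build-arg GIT_COMMIT="$(git rev-parse HEAD)" \
  -t dpk-tokenization2arrow:dev
```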
New file (33 additions, 0 deletions): Dockerfile for the Ray runtime image
```dockerfile
ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310

FROM ${BASE_IMAGE}

# see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
USER root
RUN chown ray:root /home/ray && chmod g=u /home/ray
USER ray

RUN pip install --upgrade --no-cache-dir pip

# Install pytest so we can test the image later
RUN pip install --no-cache-dir pytest
ARG DPK_WHEEL_FILE_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

COPY --chmod=775 --chown=ray:root dpk_tokenization2arrow/ dpk_tokenization2arrow/
COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Set environment
ENV PYTHONPATH /home/ray

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
```
New file (35 additions, 0 deletions): the transform Makefile
```makefile
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform and is used to match against
# expected files and is used to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

################################################################################

TRANSFORM_PYTHON_SRC="-m dpk_$(TRANSFORM_NAME).runtime"
TRANSFORM_RAY_SRC="-m dpk_$(TRANSFORM_NAME).ray.runtime"

run-cli-sample-python:
	# TODO: set env variable HF_TOKEN to download tokenizer from HF
	make venv
	source venv/bin/activate && \
	rm -rf output/ds02 && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).runtime \
		--data_local_config "{ 'input_folder' : 'test-data/ds02/input', 'output_folder' : 'output/ds02'}"

run-cli-sample-ray:
	# TODO: set env variable HF_TOKEN to download tokenizer from HF
	make venv
	source venv/bin/activate && \
	rm -rf output/ds02 && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.runtime \
		--data_local_config "{ 'input_folder' : 'test-data/ds01/input', 'output_folder' : 'output/ds01'}" \
		--run_locally True
```
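For example, the sample targets above can be run from the transform directory; per the TODO comments, `HF_TOKEN` may be needed when the tokenizer has to be downloaded from Hugging Face (the token value below is a placeholder):

```shell
# Run the python-runtime sample; HF_TOKEN is only required for tokenizers that need authentication.
HF_TOKEN=<your Hugging Face token> make run-cli-sample-python
```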
New file (135 additions, 0 deletions): the transform README
# Tokenization2Arrow Transform

Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Contributors

- Santosh Borse ([email protected])

## Summary

Distributed tokenization module for data sets using any Hugging Face compatible tokenizer. It is built upon the existing [DPK Tokenizer](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/tokenization).

For every input .parquet file it generates an .arrow file plus two metadata files (in the `meta` folder), illustrated below:

- .arrow file: contains the actual tokens
- .docs file: contains a one-line summary of the file, formatted as

      [full file path], documents: [total document count], tokens: [total token count]

- .doc.ids file: contains the token count for every document in the file; each line looks like

      [document id], [document's token count]
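For illustration only, with hypothetical paths and counts that follow the format above, the two metadata files might contain:

```
# <output folder>/meta/<file>.docs -- one summary line for the file
/path/to/sample.arrow, documents: 3, tokens: 1250

# <output folder>/meta/<file>.doc.ids -- one line per document: id and token count
doc-001, 410
doc-002, 395
doc-003, 445
```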
The data tokenization transform operates by converting a (non-empty) input table into an output table
using a pre-trained tokenizer. The input table is required to have a minimum of two columns,
named `document_id` and `contents` by default; alternate column names can be specified using
`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents.
The values in the `document_id` column must be unique across the dataset,
while the `contents` column stores the corresponding document content. To run the example demonstrations
in this directory, a machine with `64GB` of RAM is recommended.
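As a minimal sketch (the file path and contents are examples, not taken from the PR), an input parquet with the default column names can be produced with pyarrow:

```python
# Build a tiny input table with the default column names expected by the transform:
# "document_id" values must be unique across the dataset; "contents" holds the document text.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({
    "document_id": ["doc-001", "doc-002"],
    "contents": ["First example document ...", "Second example document ..."],
})
pq.write_table(table, "input/sample.parquet")  # example path
```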
To specify a pre-trained tokenizer, use the `--tkn_tokenizer` parameter.
This parameter accepts the name of a tokenizer that can be downloaded from Hugging Face,
such as `hf-internal-testing/llama-tokenizer`, `bigcode/starcoder`, or any other tokenizer compatible
with the Hugging Face AutoTokenizer library. Additionally, you can use the `--tkn_tokenizer_args` parameter
to pass extra arguments to the chosen tokenizer.
For instance, when loading a Hugging Face tokenizer such as `bigcode/starcoder`, which requires an access token,
you can specify `use_auth_token=<your token>` in `--tkn_tokenizer_args`.
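For instance, the two parameters could be combined as follows (the token value is a placeholder):

```shell
# Passed alongside the other launcher/transform options
--tkn_tokenizer bigcode/starcoder \
--tkn_tokenizer_args "cache_dir=/tmp/hf,use_auth_token=<your HF token>"
```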
The tokenization transform uses the specified tokenizer to tokenize each row of the input table,
assuming each row represents a document, and saves the result to the corresponding row of the output table.
The output table generally consists of four columns: `tokens`, `document_id`, `document_length`, and `token_count`.

The `tokens` column stores the sequence of token IDs generated by the tokenizer for the document.
The `document_id` column (or the name designated in `--tkn_doc_id_column`) contains the document ID,
while `document_length` and `token_count` respectively record the length of the document and the total number of generated tokens.
During tokenization, the tokenizer disregards empty documents (rows) in the input table,
as well as documents that yield no tokens or fail during tokenization.
The count of such documents is stored in the `num_empty_rows` field of the `metadata` file.
In certain cases, tokenization may be slow,
particularly when handling lengthy documents containing millions of characters.
To address this, you can use the `--tkn_chunk_size` parameter to define the length of the chunks to tokenize at a given time.
For English text (`en`), a chunk size of `20,000` characters (roughly `15` pages of text) is recommended.
The tokenizer then tokenizes each chunk separately and concatenates the resulting token IDs.
By default, `--tkn_chunk_size` is `0`, meaning each document is tokenized as a whole, regardless of its length.
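The chunking behaviour can be illustrated with a short sketch. This is not the transform's implementation: the transform rounds chunk boundaries to whole words, while the sketch below splits on raw character offsets for brevity, using the Hugging Face `AutoTokenizer`:

```python
# Tokenize a long document in fixed-size character chunks and concatenate the token IDs.
from transformers import AutoTokenizer

def tokenize_in_chunks(text: str, tokenizer, chunk_size: int = 20_000) -> list[int]:
    if chunk_size <= 0:
        # chunk_size 0 means: tokenize the document as a whole
        return tokenizer(text, add_special_tokens=False)["input_ids"]
    token_ids: list[int] = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        token_ids.extend(tokenizer(chunk, add_special_tokens=False)["input_ids"])
    return token_ids

tok = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
ids = tokenize_in_chunks("some very long document ...", tok)
```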
## Running

### CLI Options
The following command line arguments are available in addition to
the options provided by the [launcher](../../../data-processing-lib/doc/launcher-options.md).
```
--tkn_tokenizer TKN_TOKENIZER
                      Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By default, `hf-internal-testing/llama-tokenizer` from HuggingFace is used
--tkn_tokenizer_args TKN_TOKENIZER_ARGS
                      Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace
--tkn_doc_id_column TKN_DOC_ID_COLUMN
                      Column contains document id which values should be unique across dataset
--tkn_doc_content_column TKN_DOC_CONTENT_COLUMN
                      Column contains document content
--tkn_text_lang TKN_TEXT_LANG
                      Specify language used in the text content for better text splitting if needed
--tkn_chunk_size TKN_CHUNK_SIZE
                      Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)
```
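As an end-to-end illustration, the following invocation combines several of these options with the module entry point used by the Makefile's `run-cli-sample-python` target (folders and tokenizer choice are examples):

```shell
python -m dpk_tokenization2arrow.runtime \
    --data_local_config "{ 'input_folder': 'test-data/ds02/input', 'output_folder': 'output/ds02' }" \
    --tkn_tokenizer hf-internal-testing/llama-tokenizer \
    --tkn_doc_id_column document_id \
    --tkn_doc_content_column contents \
    --tkn_chunk_size 20000
```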
### Running the samples
To run the samples, use one of the following `make` targets:

* `run-cli-sample-python` - runs dpk_tokenization2arrow using the python runtime

or

* `run-cli-sample-ray` - runs dpk_tokenization2arrow using the ray runtime

These targets will activate the virtual environment and set up any configuration needed.
Use the `-n` option of `make` to see the details of what is done to run the sample.

For example,
```shell
make run-cli-sample-python
...
```
Then
```shell
ls output
```
to see the results of the transform.
### Code example
Here is a sample [notebook](tokenization2arrow.ipynb).

### Transforming data using the transform image

To use the transform image to transform your data, please refer to the
[running images quickstart](../../../doc/quick-start/run-transform-image.md),
substituting the name of this transform image and runtime as appropriate.
# Tokenization2Arrow Transform for Ray

## Summary
This project wraps the tokenization2arrow transform with a Ray runtime.

## Configuration and command line Options

Configuration and command line options are the same as for the base python transform.
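For reference, a local Ray invocation mirroring the Makefile's `run-cli-sample-ray` target might look like this (folders are examples):

```shell
python -m dpk_tokenization2arrow.ray.runtime \
    --run_locally True \
    --data_local_config "{ 'input_folder': 'test-data/ds01/input', 'output_folder': 'output/ds01' }"
```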
### Launched Command Line Options
In addition to the options available to the transform as defined here, the set of
[launcher options](../../../data-processing-lib/doc/launcher-options.md) is available.

### Code example
Here is a sample [notebook](tokenization2arrow-ray.ipynb) that uses the Ray runtime.
Review comments (on the transform Makefile):
@santoshborse This file is missing the following:
TRANSFORM_PYTHON_SRC=
TRANSFORM_RAY_SRC=
See https://github.com/IBM/data-prep-kit/blob/dev/transforms/Makefile.transform.template for an example. This is also related to the comment above on transform_python and transform_ray. Let's follow the convention, since there is no really good reason not to.
OK, I will make the changes. I had followed https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/Makefile.
Hi @touma-I, I have updated the module names and the Makefile as you asked.
@santoshborse Thank you! This looks good. Sorry about the confusion. I am hoping in the next iteration we will simplify things further and get rid of a few constraints. Please stay tuned. I might also reach out to bounce off a few ideas.