diff --git a/examples/notebooks/pdf2parquet_colab-version.ipynb b/examples/notebooks/pdf2parquet_colab-version.ipynb
new file mode 100644
index 000000000..af3760b2e
--- /dev/null
+++ b/examples/notebooks/pdf2parquet_colab-version.ipynb
@@ -0,0 +1,268 @@
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "afd55886-5f5b-4794-838e-ef8179fb0394",
+ "metadata": {},
+ "source": [
+ "### Open this notebook in Google Colab"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9f3dbe08",
+ "metadata": {},
+ "source": [
+ "Click the badge to open this notebook in Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf2parquet_colab-version.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2566f9d9",
+ "metadata": {},
+ "source": [
+ "### Create the input directory and download sample files on Google Colab"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21aaf3c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the local input folder (-p: no error if it already exists).\n",
+ "!mkdir -p 'input/solar-system'\n",
+ "# Download the two sample PDFs from the data-prep-kit repository (dev branch) into the input folder.\n",
+ "!wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n",
+ "!wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n",
+ "# Download my_utils.py from the same repository into the current working directory.\n",
+ "!wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
+ "metadata": {},
+ "source": [
+ "### Install dependencies for Google Colab environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9e76c882",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Every requirement with a version specifier is quoted: the command line is run by a\n",
+ "# shell, and an unquoted '>=' would be parsed as an output redirection, dropping the\n",
+ "# version constraint and creating a stray file named '=0.2.2.dev2'.\n",
+ "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
+ "%pip install data-prep-connector\n",
+ "%pip install 'data-prep-toolkit>=0.2.2.dev2'\n",
+ "%pip install 'data-prep-toolkit-transforms[pdf2parquet]>=0.2.2.dev3'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6632ff81",
+ "metadata": {},
+ "source": [
+ "### Restart runtime\n",
+ "\n",
+ "After installing the dependencies, be sure to restart the runtime so that the newly installed libraries are loaded.\n",
+ "\n",
+ "You do this by going to **`Runtime --> Restart Session`**\n",
+ "\n",
+ "Then you can continue with the next step (there is no need to re-run the cells above)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "89790d59",
+ "metadata": {},
+ "source": [
+ "### Import required classes and modules"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bf24fa62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "import os\n",
+ "import sys\n",
+ "\n",
+ "from data_processing.runtime.pure_python import PythonTransformLauncher\n",
+ "from data_processing.utils import ParamsUtils\n",
+ "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "029a8c9e",
+ "metadata": {},
+ "source": [
+ "### Configure class for managing input and output directories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2e184056",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "\n",
+ "class MyConfig:\n",
+ "    \"\"\"Empty attribute holder for the notebook-wide settings assigned below.\"\"\"\n",
+ "\n",
+ "\n",
+ "MY_CONFIG = MyConfig()\n",
+ "\n",
+ "# Folder containing the documents to convert.\n",
+ "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n",
+ "# Folder that receives the transform results.\n",
+ "MY_CONFIG.OUTPUT_FOLDER = \"output\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0893534f",
+ "metadata": {},
+ "source": [
+ "### Add parent directory to path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f0f4cc4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "\n",
+ "# abspath('') resolves to the kernel's current working directory.\n",
+ "this_dir = os.path.abspath('')\n",
+ "# dirname() of an absolute, normalized path is itself absolute and normalized.\n",
+ "parent_dir = os.path.dirname(this_dir)\n",
+ "\n",
+ "# Append only once, so that re-running this cell does not pile up duplicate entries.\n",
+ "if parent_dir not in sys.path:\n",
+ "    sys.path.append(parent_dir)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b16244d2",
+ "metadata": {},
+ "source": [
+ "### Configure transform parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c57f10c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Local data access: the folders configured in MY_CONFIG.\n",
+ "input_folder = MY_CONFIG.INPUT_DATA_DIR\n",
+ "output_folder = MY_CONFIG.OUTPUT_FOLDER\n",
+ "local_conf = {\n",
+ "    \"input_folder\": input_folder,\n",
+ "    \"output_folder\": output_folder,\n",
+ "}\n",
+ "\n",
+ "params = {\n",
+ "    # Data access. Only required parameters are specified\n",
+ "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+ "    # File extensions to use; written as a plain list literal, since there is no\n",
+ "    # need to round-trip a constant through ast.literal_eval().\n",
+ "    \"data_files_to_use\": [\".pdf\", \".docx\", \".pptx\", \".zip\"],\n",
+ "    # execution info\n",
+ "    \"runtime_pipeline_id\": \"pipeline_id\",\n",
+ "    \"runtime_job_id\": \"job_id\",\n",
+ "    # pdf2parquet params\n",
+ "    \"pdf2parquet_double_precision\": 0,\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a",
+ "metadata": {},
+ "source": [
+ "### Invoke transform with python runtime"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0775e400-7469-49a6-8998-bd4772931459",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "# %%capture must stay on the first line; it suppresses this cell's stdout/stderr.\n",
+ "# The parameters are handed over through sys.argv\n",
+ "# (presumably parsed by the launcher - verify against PythonTransformLauncher).\n",
+ "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
+ "launcher = PythonTransformLauncher(runtime_config=Pdf2ParquetPythonTransformConfiguration())\n",
+ "launcher.launch()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
+ "metadata": {},
+ "source": [
+ "### Print files in output folder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fef6667e-71ed-4054-9382-55c6bb3fda70",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import glob\n",
+ "\n",
+ "# List the contents of the configured output folder instead of a hard-coded copy of\n",
+ "# its name; sorted so that the listing is stable between runs.\n",
+ "output_files = sorted(glob.glob(f\"{MY_CONFIG.OUTPUT_FOLDER}/*\"))\n",
+ "print(output_files)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5