diff --git a/examples/notebooks/pdf2parquet_colab-version.ipynb b/examples/notebooks/pdf2parquet_colab-version.ipynb
new file mode 100644
index 000000000..af3760b2e
--- /dev/null
+++ b/examples/notebooks/pdf2parquet_colab-version.ipynb
@@ -0,0 +1,246 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "afd55886-5f5b-4794-838e-ef8179fb0394",
+   "metadata": {},
+   "source": [
+    "### Open this notebook in Google Colab"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f3dbe08",
+   "metadata": {},
+   "source": [
+    "Click the badge to open this notebook in Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf2parquet_colab-version.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2566f9d9",
+   "metadata": {},
+   "source": [
+    "### Create input directory and download sample files on Google Colab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "21aaf3c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p 'input/solar-system'\n",
+    "!wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n",
+    "!wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n",
+    "!wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
+   "metadata": {},
+   "source": [
+    "### Install dependencies for Google Colab environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e76c882",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quote requirement specifiers: an unquoted '>=' is parsed by the shell as a redirect,\n",
+    "# which silently drops the version constraint and writes pip's output to a stray file.\n",
+    "! pip install data-prep-connector\n",
+    "! pip install 'data-prep-toolkit>=0.2.2.dev2'\n",
+    "! pip install 'data-prep-toolkit-transforms[pdf2parquet]>=0.2.2.dev3'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6632ff81",
+   "metadata": {},
+   "source": [
+    "### Restart runtime\n",
+    "\n",
+    "After installing dependencies, be sure to restart the runtime so that the newly installed libraries are loaded.\n",
+    "\n",
+    "You do this by going to **`Runtime --> Restart Session`**\n",
+    "\n",
+    "Then you can continue to the next step (no need to re-run the cells above)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "89790d59",
+   "metadata": {},
+   "source": [
+    "### Import required classes and modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf24fa62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast\n",
+    "import glob\n",
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "from data_processing.runtime.pure_python import PythonTransformLauncher\n",
+    "from data_processing.utils import ParamsUtils\n",
+    "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "029a8c9e",
+   "metadata": {},
+   "source": [
+    "### Configure class for managing input and output directories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e184056",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Configuration\n",
+    "class MyConfig:\n",
+    "    pass\n",
+    "\n",
+    "MY_CONFIG = MyConfig()\n",
+    "\n",
+    "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n",
+    "\n",
+    "MY_CONFIG.OUTPUT_FOLDER = \"output\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0893534f",
+   "metadata": {},
+   "source": [
+    "### Add parent directory to path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f0f4cc4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Make modules located one level above the working directory importable\n",
+    "this_dir = os.path.abspath('')\n",
+    "parent_dir = os.path.dirname(this_dir)\n",
+    "sys.path.append(os.path.abspath(parent_dir))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b16244d2",
+   "metadata": {},
+   "source": [
+    "### Configure transform parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c57f10c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create parameters\n",
+    "input_folder = MY_CONFIG.INPUT_DATA_DIR\n",
+    "output_folder = MY_CONFIG.OUTPUT_FOLDER\n",
+    "local_conf = {\n",
+    "    \"input_folder\": input_folder,\n",
+    "    \"output_folder\": output_folder,\n",
+    "}\n",
+    "params = {\n",
+    "    # Data access. Only required parameters are specified\n",
+    "    \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n",
+    "    \"data_files_to_use\": ast.literal_eval(\"['.pdf','.docx','.pptx','.zip']\"),\n",
+    "    # execution info\n",
+    "    \"runtime_pipeline_id\": \"pipeline_id\",\n",
+    "    \"runtime_job_id\": \"job_id\",\n",
+    "    # pdf2parquet params\n",
+    "    \"pdf2parquet_double_precision\": 0,\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a",
+   "metadata": {},
+   "source": [
+    "### Invoke transform with python runtime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0775e400-7469-49a6-8998-bd4772931459",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "# %%capture suppresses this cell's stdout/stderr; remove it to watch the launcher's log output.\n",
+    "sys.argv = ParamsUtils.dict_to_req(d=params)\n",
+    "launcher = PythonTransformLauncher(runtime_config=Pdf2ParquetPythonTransformConfiguration())\n",
+    "launcher.launch()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
+   "metadata": {},
+   "source": [
+    "### Print files in output folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fef6667e-71ed-4054-9382-55c6bb3fda70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List whatever the transform wrote to the configured output folder\n",
+    "output_files = glob.glob(os.path.join(MY_CONFIG.OUTPUT_FOLDER, '*'))\n",
+    "print(output_files)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}