diff --git a/examples/data-files/pdf-processing-1/README.md b/examples/data-files/pdf-processing-1/README.md new file mode 100644 index 000000000..e81e80ee8 --- /dev/null +++ b/examples/data-files/pdf-processing-1/README.md @@ -0,0 +1,11 @@ +## Creating Input PDFs (Optional) + +Sample PDFs we use for this example are created from markdown documents using pandoc utility, as follows. + +```bash +pandoc earth.md -o earth.pdf +pandoc earth2.md -o earth2.pdf +pandoc mars.md -o mars.pdf +pandoc spam.md -o spam.pdf +pandoc lorem-ipsum.md -o lorem-ipsum.pdf +``` \ No newline at end of file diff --git a/examples/data-files/pdf-processing-1/earth-copy.pdf b/examples/data-files/pdf-processing-1/earth-copy.pdf new file mode 100644 index 000000000..9a775a998 Binary files /dev/null and b/examples/data-files/pdf-processing-1/earth-copy.pdf differ diff --git a/examples/notebooks/intro/input/solar-system/earth.md b/examples/data-files/pdf-processing-1/earth.md similarity index 100% rename from examples/notebooks/intro/input/solar-system/earth.md rename to examples/data-files/pdf-processing-1/earth.md diff --git a/examples/notebooks/intro/input/solar-system/earth.pdf b/examples/data-files/pdf-processing-1/earth.pdf similarity index 99% rename from examples/notebooks/intro/input/solar-system/earth.pdf rename to examples/data-files/pdf-processing-1/earth.pdf index b6bc7edc8..9a775a998 100644 Binary files a/examples/notebooks/intro/input/solar-system/earth.pdf and b/examples/data-files/pdf-processing-1/earth.pdf differ diff --git a/examples/data-files/pdf-processing-1/earth2.md b/examples/data-files/pdf-processing-1/earth2.md new file mode 100644 index 000000000..04f4eb6c3 --- /dev/null +++ b/examples/data-files/pdf-processing-1/earth2.md @@ -0,0 +1,18 @@ +# Earth + + +## Solar System + +Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun. + +For more details about the Solar system see Chapter 1. + +## Earth + +Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life. + +Basic facts about Earth: + +- Distance from the Sun: Average of 149.6 million kilometers (93 million miles) +- Rotation Period: 24 hours (one day) +- Moons: One moon, called Luna or simply "the Moon". \ No newline at end of file diff --git a/examples/data-files/pdf-processing-1/earth2.pdf b/examples/data-files/pdf-processing-1/earth2.pdf new file mode 100644 index 000000000..5c024886a Binary files /dev/null and b/examples/data-files/pdf-processing-1/earth2.pdf differ diff --git a/examples/data-files/pdf-processing-1/lorem-ipsum.md b/examples/data-files/pdf-processing-1/lorem-ipsum.md new file mode 100644 index 000000000..35723ccaa --- /dev/null +++ b/examples/data-files/pdf-processing-1/lorem-ipsum.md @@ -0,0 +1,3 @@ +Lorem ipsum +Lorem ipsum +Lorem ipsum \ No newline at end of file diff --git a/examples/data-files/pdf-processing-1/lorem-ipsum.pdf b/examples/data-files/pdf-processing-1/lorem-ipsum.pdf new file mode 100644 index 000000000..b2807a44d Binary files /dev/null and b/examples/data-files/pdf-processing-1/lorem-ipsum.pdf differ diff --git a/examples/notebooks/intro/input/solar-system/mars.md b/examples/data-files/pdf-processing-1/mars.md similarity index 100% rename from examples/notebooks/intro/input/solar-system/mars.md rename to examples/data-files/pdf-processing-1/mars.md diff --git a/examples/notebooks/intro/input/solar-system/mars.pdf b/examples/data-files/pdf-processing-1/mars.pdf similarity index 99% rename from examples/notebooks/intro/input/solar-system/mars.pdf rename to examples/data-files/pdf-processing-1/mars.pdf index a48c4365b..5e464d870 100644 Binary files a/examples/notebooks/intro/input/solar-system/mars.pdf and b/examples/data-files/pdf-processing-1/mars.pdf differ diff --git 
a/examples/data-files/pdf-processing-1/spam.md b/examples/data-files/pdf-processing-1/spam.md new file mode 100644 index 000000000..e5526cbad --- /dev/null +++ b/examples/data-files/pdf-processing-1/spam.md @@ -0,0 +1 @@ +Free xxx \ No newline at end of file diff --git a/examples/data-files/pdf-processing-1/spam.pdf b/examples/data-files/pdf-processing-1/spam.pdf new file mode 100644 index 000000000..43999b8ac Binary files /dev/null and b/examples/data-files/pdf-processing-1/spam.pdf differ diff --git a/examples/notebooks/intro/README.md b/examples/notebooks/intro/README.md deleted file mode 100644 index 77a80865b..000000000 --- a/examples/notebooks/intro/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Data Prep Kit Introduction - -This is an example featuring some of the features of data prep kit. - -## Running the code - -The code can be run on either - -1. Google colab: very easy to run; no local setup needed. -2. On your local Python environment. Here is a quick guide. You can find instructions for latest version [here](../../../README.md#-getting-started) - -```bash -conda create -n data-prep-kit -y python=3.11 -conda activate data-prep-kit - -# install the following in 'data-prep-kit' environment -pip3 install data-prep-toolkit==0.2.1 -pip3 install data-prep-toolkit-transforms==0.2.1 -pip3 install data-prep-toolkit-transforms-ray==0.2.1 -pip3 install jupyterlab ipykernel ipywidgets - -## install custom kernel -## Important: Use this kernel when running example notebooks! 
-python -m ipykernel install --user --name=data-prep-kit --display-name "dataprepkit" - -# start jupyter and run the notebooks with this jupyter -jupyter lab -``` - -## Intro - -This notebook will demonstrate processing PDFs - -`PDFs ---> text ---> chunks ---> exact dedupe ---> fuzzy dedupe ---> embeddings` - -[python version](dpk_intro_1_python.ipynb)   |   [ray version](dpk_intro_1_ray.ipynb) diff --git a/examples/notebooks/intro/dpk_intro_1_python.ipynb b/examples/notebooks/intro/dpk_intro_1_python.ipynb deleted file mode 100644 index ab7cda854..000000000 --- a/examples/notebooks/intro/dpk_intro_1_python.ipynb +++ /dev/null @@ -1,3667 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", - "metadata": { - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" - }, - "source": [ - "# Data Prep Kit Demo 1 - Python version\n", - "\n", - "This notebook will introduce DPK and showcase some of it's capabilities.\n", - "\n", - "Here is the workflow\n", - "\n", - "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n" - ] - }, - { - "cell_type": "markdown", - "id": "b15976e3", - "metadata": { - "id": "b15976e3" - }, - "source": [ - "## How to run this notebook\n", - "\n", - "Two options:\n", - "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n", - "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", - "\n", - "The notebook will work as in both environments" - ] - }, - { - "cell_type": "markdown", - "id": "eb8b0d5c", - "metadata": { - "id": "eb8b0d5c" - }, - "source": [ - "## Step-1: Inspect the Data\n", - "\n", - "We will use simple PDFs about Solar system. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n", - "\n", - "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n", - "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n" - ] - }, - { - "cell_type": "markdown", - "id": "39a0ab6e", - "metadata": { - "id": "39a0ab6e" - }, - "source": [ - "## Step-2: Figure out Runtime Environment\n", - "\n", - "### 2.1 - Determine runtime\n", - "\n", - "Determine if we are running on Google colab or local python environment" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1fe354b7", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1fe354b7", - "outputId": "5c153f72-08ed-4d6e-ccc7-dae851e7fd8b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "markdown", - "id": "8e7c104b", - "metadata": { - "id": "8e7c104b" - }, - "source": [ - "### 2.2 -Download Data if running on Google Colab" - ] - }, - { - "cell_type": 
"code", - "execution_count": 2, - "id": "3309799e", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3309799e", - "outputId": "99530315-6dd5-405d-dbde-61e2332e441b" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " !mkdir -p 'input/solar-system'\n", - " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n", - " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n", - " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'" - ] - }, - { - "cell_type": "markdown", - "id": "a5dc2b68", - "metadata": { - "id": "a5dc2b68" - }, - "source": [ - "### 2.3 - Install dependencies if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1fcec577", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1fcec577", - "outputId": "0f77fc39-ffeb-48da-ce6f-1750d8d3ad62" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " ! 
pip install --default-timeout=100 \\\n", - " data-prep-toolkit==0.2.1 \\\n", - " data-prep-toolkit-transforms==0.2.1 \\\n", - " deepsearch-toolkit\n" - ] - }, - { - "cell_type": "markdown", - "id": "243322b8", - "metadata": { - "id": "243322b8" - }, - "source": [ - "### 2.4 - Restart Runtime\n", - "\n", - "After installing dependencies, be sure restart runtime, so libraries will be loaded\n", - "\n", - "You do this by going to **`Runtime --> Restart Session`**\n", - "\n", - "Then you can continue to the next step (no need to re-run the notebook)" - ] - }, - { - "cell_type": "markdown", - "id": "e8b10be1", - "metadata": { - "id": "e8b10be1" - }, - "source": [ - "## Step-2: Configuration" - ] - }, - { - "cell_type": "markdown", - "id": "356c66f7", - "metadata": { - "id": "356c66f7" - }, - "source": [ - "### 2.1 - Basic Config" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e4YMZrBuFycl", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e4YMZrBuFycl", - "outputId": "d7ee9449-4f21-4c9a-fa54-14b7f28d764a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "33345487", - "metadata": { - "id": "33345487" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "## Configuration\n", - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig ()\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", - "\n", - "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", - "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", - "\n", - "## Embedding model\n", - "MY_CONFIG.EMBEDDING_MODEL = 
'sentence-transformers/all-MiniLM-L6-v2'" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b15e6827", - "metadata": { - "id": "b15e6827" - }, - "outputs": [], - "source": [ - "## Add parent dir to path\n", - "import os,sys\n", - "\n", - "this_dir = os.path.abspath('')\n", - "parent_dir = os.path.dirname(this_dir)\n", - "sys.path.append (os.path.abspath (parent_dir))" - ] - }, - { - "cell_type": "markdown", - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", - "metadata": { - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" - }, - "source": [ - "### 2.2 - Setup input/outpur directories" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "outputId": "4d5511fb-1c6f-47df-e5ea-2c1b354d262f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Cleared output directory\n" - ] - } - ], - "source": [ - "import os, sys\n", - "import shutil\n", - "\n", - "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", - " raise Exception (f\"โŒ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", - "\n", - "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", - "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", - "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", - "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", - "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_embeddings_out')\n", - "\n", - "## clear output folder\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", - "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", - "\n", - "print (\"โœ… Cleared output directory\")" - ] - }, - { - "cell_type": "markdown", - "id": 
"2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", - "metadata": { - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" - }, - "source": [ - "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", - "\n", - "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", - "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", - "metadata": { - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" - }, - "source": [ - "### 3.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "outputId": "c50847d4-f2c7-4559-f5f7-d6a3d025027d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" - ] - } - ], - "source": [ - "STAGE = 1\n", - "\n", - "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", - "output_folder = output_parquet_dir\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", - "metadata": { - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" - }, - "source": [ - "### 3.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 657, - "referenced_widgets": [ - "97b603697cfa4b4ea4e6735b6768ca35", - "e87e8d3262c54cfaaa8768505edacda3", - "b78aa40816e44f7fbebcb24ca68818b3", - "7053c9606a414e978636a7e241909504", - 
"da0787b239764847a731083997780a85", - "553f3c16839a49d79591d0fc4862bed6", - "c0eb5bc8f6ee427ca42204b3c56f9a4e", - "9d184ed175f0403fb03c2e13dfd04e0a", - "724778729161445c98b187031ae4f67c", - "1cb3bbf7d724411cbe9831543a4aecc0", - "06f9b33494984e4885d5aad813d1d2bc" - ] - }, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "outputId": "01d207fb-983d-40b2-e5f6-e38e3789110a" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:34:39 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", - "13:34:39 INFO - pipeline id pipeline_id\n", - "13:34:39 INFO - code location None\n", - "13:34:39 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", - "13:34:39 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:39 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "13:34:39 INFO - orchestrator pdf2parquet started at 2024-10-18 13:34:39\n", - "13:34:39 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", - "13:34:39 INFO - Initializing models\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "750f3b6951094b2eb68490c7f5f98148", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 10 files: 0%| | 0/10 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...10116e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011efbdbcb9-f0af-42f0-b191-2f14ce3ddc7cpdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdf
\n", - "" - ], - "text/plain": [ - " filename contents num_pages \\\n", - "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "\n", - " num_tables num_doc_elements document_id ext \\\n", - "0 0 11 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 pdf \n", - "1 0 11 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:43.410297 0.794765 earth.pdf " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(5)\n", - "\n", - "## To display certain columns\n", - "#parquet_df[['column1', 'column2', 'column3']].head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "e5058a21", - "metadata": { - "id": "e5058a21" - }, - "source": [ - "\n", - "### 3.4 - Understand the output\n", - "\n", - "Here are some interesting attributes to note:\n", - "\n", - "- **filename** : original filename\n", - "- **contents** : text\n", - "- **document_id**: unique id (UUID) assignd to this document\n", - "- **hash** : hash of document\n", - "- **pdf_convert_time** : time to convert this pdf in seconds\n", - "\n", - "Let's inspect the **contents** column. See how the text is being divided up!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "f870e624", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f870e624", - "outputId": "0b4c054f-3a8a-4db3-f32f-17bd1466b102" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", - " 'filename': 'mars.pdf',\n", - " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.35137939,\n", - " 654.45184326,\n", - " 169.88169861,\n", - " 667.98492432],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.09541321,\n", - " 630.68127441,\n", - " 210.66503906,\n", - " 642.34405518],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.84518433,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.02520752],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. 
At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.18510437,\n", - " 570.83258057,\n", - " 374.99838257,\n", - " 581.07043457],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about the Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.22866821,\n", - " 542.98168945,\n", - " 163.86282349,\n", - " 554.45288086],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87440491,\n", - " 500.84011841,\n", - " 477.48345947,\n", - " 534.55810547],\n", - " 'page': 1,\n", - " 'span': [0, 196]}],\n", - " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", - " 'desert world with a thin atmosphere composed '\n", - " 'primarily of carbon dioxide. Its reddish hue comes '\n", - " 'from iron oxide, or rust, prevalent on its surface.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.2026062,\n", - " 482.90710449,\n", - " 237.04431152,\n", - " 493.07443237],\n", - " 'page': 1,\n", - " 'span': [0, 23]}],\n", - " 'text': 'Basic facts about Mars:',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 453.019104,\n", - " 477.48171997,\n", - " 474.9703064],\n", - " 'page': 1,\n", - " 'span': [0, 78]}],\n", - " 'text': 'ยท Distance from the Sun: Average of 228 million '\n", - " 'kilometers (142 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.79351807,\n", - " 431.73287964,\n", - " 451.2142334],\n", - " 'page': 1,\n", - " 'span': [0, 64]}],\n", - " 'text': 'ยท Rotation Period: 24.6 hours (one Martian day - '\n", - " 'called a \"sol\")',\n", - " 'type': 'paragraph'},\n", - " 
{'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 429.10913086,\n", - " 365.9559021,\n", - " 438.83737183],\n", - " 'page': 1,\n", - " 'span': [0, 44]}],\n", - " 'text': 'ยท Moons: Two small moons, Phobos and Deimos.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.51646423],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } - ], - "source": [ - "import pprint\n", - "import json\n", - "\n", - "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", - "# json.loads(output_df.iloc[0, ]['contents'])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "e1a10c2d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e1a10c2d", - "outputId": "c1d992c2-faa8-40cd-c375-857970201daa" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", - " 'filename': 'earth.pdf',\n", - " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.30961609,\n", - " 654.45184326,\n", - " 174.04208374,\n", - " 667.93347168],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.12528992,\n", - " 
630.69073486,\n", - " 210.66503906,\n", - " 642.27935791],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87112427,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.04595947],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.20942688,\n", - " 570.81555176,\n", - " 375.57919312,\n", - " 581.08459473],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about our Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.15542603,\n", - " 542.98168945,\n", - " 167.32983398,\n", - " 554.36669922],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.91053772,\n", - " 512.46295166,\n", - " 477.84887695,\n", - " 534.48431396],\n", - " 'page': 1,\n", - " 'span': [0, 107]}],\n", - " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", - " 'planet. 
Earth is the only place we know of with life.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.30151367,\n", - " 494.86206055,\n", - " 240.17156982,\n", - " 505.07229614],\n", - " 'page': 1,\n", - " 'span': [0, 24]}],\n", - " 'text': 'Basic facts about Earth:',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 464.97409058,\n", - " 477.47979736,\n", - " 487.02810669],\n", - " 'page': 1,\n", - " 'span': [0, 79]}],\n", - " 'text': 'ยท Distance from the Sun: Average of 149.6 million '\n", - " 'kilometers (93 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 452.86901855,\n", - " 317.90722656,\n", - " 463.24041748],\n", - " 'page': 1,\n", - " 'span': [0, 37]}],\n", - " 'text': 'ยท Rotation Period: 24 hours (one day)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.71496582,\n", - " 396.66357422,\n", - " 451.19915771],\n", - " 'page': 1,\n", - " 'span': [0, 52]}],\n", - " 'text': 'ยท Moons: One moon, called Luna or simply \"the Moon\".',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.53633118],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } - ], - "source": [ - "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" - ] - }, - { - "cell_type": "markdown", - "id": "72274586", - "metadata": { - "id": "72274586" - }, - "source": [ - "## Step-4: Doc chunks\n", - "\n", - "In the previous step, we have extracted text from oru PDFs. 
But we have the content of entire file as 'one row' in our parquet output.\n", - "\n", - "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", - "\n", - "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", - "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", - "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", - "which provides the required JSON structure." - ] - }, - { - "cell_type": "markdown", - "id": "96198fa6", - "metadata": { - "id": "96198fa6" - }, - "source": [ - "### 4.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "305f00a3", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "305f00a3", - "outputId": "dd511f34-bab3-4dde-d938-493debb02e5e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" - ] - } - ], - "source": [ - "STAGE = 2\n", - "\n", - "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_chunk_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "369f2cd1", - "metadata": { - "id": "369f2cd1" - }, - "source": [ - "### 4.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "5b7b18d5", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - 
}, - "id": "5b7b18d5", - "outputId": "e0b87171-9d66-473f-e66a-e4b6ae3c3f66" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:34:45 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", - "13:34:45 INFO - pipeline id pipeline_id\n", - "13:34:45 INFO - code location None\n", - "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", - "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:34:45 INFO - orchestrator doc_chunk started at 2024-10-18 13:34:45\n", - "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", - "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", - "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", - "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:34:45 INFO - done flushing in 0.0 sec\n", - "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:2 completed successfully\n", - "CPU times: user 826 ms, sys: 101 ms, total: 928 ms\n", - "Wall time: 923 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from doc_chunk_transform_python import 
DocChunkPythonTransformConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "params = {\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # doc_chunk arguments\n", - " # ...\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = PythonTransformLauncher(DocChunkPythonTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "213afdf6", - "metadata": { - "id": "213afdf6" - }, - "source": [ - "### 4.3 - Inspect Generated output\n", - "\n", - "We would see documents are split into many chunks" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "d8138d43", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 897 - }, - "id": "d8138d43", - "outputId": "fd01e0cb-899e-4c73-d50e-5f4e6f5ff802" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Files processed : 2\n", - "Chunks created : 8\n", - "Input data dimensions (rows x columns)= (2, 12)\n", - "Output data dimensions (rows x columns)= (8, 16)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "3 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "7 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "3 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "7 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 
" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (f\"Files processed : {input_df.shape[0]:,}\")\n", - "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "9e9ca75c", - "metadata": { - "id": "9e9ca75c" - }, - "source": [ - "### 4.4 - Understanding the Output\n", - "\n", - "Here we see 2 PDF files are split into 6 chunks. Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n", - "\n", - "See how **document_id** is carried throughout. This helps us identify original documents.\n", - "\n", - "Also note **contents** is now plain text (not JSON as before)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "3090c950", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - }, - "id": "3090c950", - "outputId": "0f4b6771-8d38-4a27-c756-21f916b23a4f" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\nยท Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Solar System\\nFor more details about the Solar...\n", - "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "3 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "5 earth.pdf Solar System\\nFor more details about our Solar...\n", - "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "7 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d5f151ae", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "d5f151ae", - "outputId": "a4c491b2-53db-4d71-da24-4479de8d1d65" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 3------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "7ad1c60d", - "metadata": { - "id": "7ad1c60d" - }, - "source": [ - "## Step-5: DOC ID generation of Chunks\n", - "\n", - "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", - "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. 
The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", - "\n", - "**This is a pre-requisite for fuzzy dedup** in the pipeline." - ] - }, - { - "cell_type": "markdown", - "id": "1afaa0fd", - "metadata": { - "id": "1afaa0fd" - }, - "source": [ - "### 5.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "6ffd6f54", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6ffd6f54", - "outputId": "1784c80d-6309-4913-9f55-c018b978968f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" - ] - } - ], - "source": [ - "\n", - "# Input for this stage is the output of exact dedeup component\n", - "# output of this component makes it possible for fdedup component to run on data.\n", - "\n", - "STAGE = 3\n", - "\n", - "input_folder = output_chunk_dir\n", - "output_folder = output_docid_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "f78a51b7", - "metadata": { - "id": "f78a51b7" - }, - "source": [ - "### 5.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "5fc77557", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5fc77557", - "outputId": "db2b8670-543e-4073-9c7d-3f9ef5f4317e" - }, - "outputs": [ - { - 
"name": "stderr", - "output_type": "stream", - "text": [ - "13:34:45 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", - "13:34:45 INFO - pipeline id pipeline_id\n", - "13:34:45 INFO - code location None\n", - "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", - "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:34:45 INFO - orchestrator doc_id started at 2024-10-18 13:34:45\n", - "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", - "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", - "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", - "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:34:45 INFO - done flushing in 0.0 sec\n", - "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:3 completed successfully\n", - "CPU times: user 12.8 ms, sys: 3.7 ms, total: 16.5 ms\n", - "Wall time: 13.1 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "params = {\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " # doc id configuration\n", - " \"doc_id_doc_column\": \"contents\",\n", - " \"doc_id_hash_column\": \"chunk_hash\",\n", - " \"doc_id_int_column\": \"chunk_id\",\n", - "}\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "a9a8c1fa", - "metadata": { - "id": "a9a8c1fa" - }, - "source": [ - "### 5.3 - Inspect Generated output\n", - "\n", - "You will notice we have two extra columns\n", - "\n", - "- **hash_column**\n", - "- **int_id_column**\n", - "\n", - "But still the same number or rows as before" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "da9adede", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 860 - }, - "id": "da9adede", - "outputId": "036db4ca-12f6-4b3e-9d7f-fa70e494870d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 16)\n", - "Output data dimensions (rows x columns)= (8, 18)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "3 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "7 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "3 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "7 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 
2 \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", - "metadata": { - "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53" - }, - "source": [ - "## Step-6: Exact Dedup\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", - "metadata": { - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" - }, - "source": [ - "### 6.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "4c7a1b94", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4c7a1b94", - "outputId": "2f6f05bc-f6fd-4d66-ea01-ed89cd5b80f3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" - ] - } - ], - "source": [ - "STAGE = 4\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_exact_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", - "metadata": { - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" - }, - "source": [ - "### 6.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": 
"a624b2b2-faad-4325-ac7d-53a840f564ef", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", - "outputId": "74dc0b75-58b5-4c97-9965-91315e8a98a5" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:34:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", - "13:34:45 INFO - pipeline id pipeline_id\n", - "13:34:45 INFO - code location None\n", - "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", - "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:34:45 INFO - orchestrator ededup started at 2024-10-18 13:34:45\n", - "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "13:34:45 INFO - Starting from the beginning\n", - "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", - "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", - "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:34:45 INFO - done flushing in 0.0 sec\n", - "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:4 completed successfully\n", - "CPU times: user 17.6 ms, sys: 997 ฮผs, total: 18.6 ms\n", - "Wall time: 15.2 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration\n", - "\n", - "\n", - "# Prepare 
the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "params = {\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # ededup parameters\n", - " \"ededup_doc_column\": \"contents\",\n", - " \"ededup_doc_id_column\": \"chunk_hash\",\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = PythonTransformLauncher(EdedupPythonTransformRuntimeConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "eaf1c3c3", - "metadata": { - "id": "eaf1c3c3" - }, - "source": [ - "### 6.3 - Inspect Generated output" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "d824ebf6", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 815 - }, - "id": "d824ebf6", - "outputId": "68f55770-c750-4607-a205-ba183603019d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (7, 19)\n", - "Input chunks before exact dedupe : 8\n", - "Output chunks after exact dedupe : 7\n", - "Duplicate chunks removed : 1\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "3 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "3 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", - "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", - "\n", - " removed \n", - "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... 
\n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "82cc9bb0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 269 - }, - "id": "82cc9bb0", - "outputId": "46d9e91d-c470-4e3e-e5c8-508c534dbceb" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nFor more details about the Solar...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "4 earth.pdf Solar System\\nFor more details about our Solar...\n", - "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "6 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "cc61dffa", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cc61dffa", - "outputId": "7fb26043-8538-48b6-80b7-16ceb818c1a8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "383f40ba", - "metadata": { - "id": "383f40ba" - }, - "source": [ - "### 6.4 - Understanding the output\n", - "\n", - "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", - "\n", - "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", - "\n", - "```text\n", - "## Solar System\n", - "\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "85309751-8556-41c6-ac32-84acc941bc8d", - "metadata": { - "id": "85309751-8556-41c6-ac32-84acc941bc8d" - }, - "source": [ - " ## Step-7: Fuzzy Dedup\n", - "\n", - "And fuzzy dedupe is only available in RAY version. 
So we will skip it here\n", - "\n", - "See this file [dpk_intro_1_ray.ipynb](dpk_intro_1_ray.ipynb)" - ] - }, - { - "cell_type": "markdown", - "id": "5370950a-2a3a-4143-8218-f9b4808099ba", - "metadata": { - "id": "5370950a-2a3a-4143-8218-f9b4808099ba" - }, - "source": [ - "## Step-8: Text encoding\n", - "\n", - "Encode text for the vector storage." - ] - }, - { - "cell_type": "markdown", - "id": "85aba685", - "metadata": { - "id": "85aba685" - }, - "source": [ - "### 8.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "outputId": "41d268f5-7cc6-432e-d56e-2ba882fbdba6" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-6: Processing input='output/04_exact_dedupe_out' --> output='output/05_embeddings_out'\n" - ] - } - ], - "source": [ - "STAGE = 6\n", - "\n", - "input_folder = output_exact_dedupe_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_embeddings_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "c97545f4", - "metadata": { - "id": "c97545f4" - }, - "source": [ - "### 8.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "outputId": "b2119b07-0654-45cd-f729-1396e18b24b1" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:34:45 INFO - text_encoder parameters are : {'content_column_name': 'contents', 
'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", - "13:34:45 INFO - pipeline id pipeline_id\n", - "13:34:45 INFO - code location None\n", - "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/04_exact_dedupe_out output_folder - output/05_embeddings_out\n", - "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:34:45 INFO - orchestrator text_encoder started at 2024-10-18 13:34:45\n", - "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.010450363159179688, 'min_file_size': 0.010318756103515625, 'total_file_size': 0.020769119262695312}\n", - "13:34:47 INFO - Completed 1 files (50.0%) in 0.004 min\n", - "13:34:47 INFO - Completed 2 files (100.0%) in 0.005 min\n", - "13:34:47 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:34:47 INFO - done flushing in 0.0 sec\n", - "13:34:47 INFO - Completed execution in 0.034 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:6 completed successfully\n", - "CPU times: user 615 ms, sys: 146 ms, total: 761 ms\n", - "Wall time: 2.24 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from text_encoder_local_python import TextEncoderPythonTransformConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "params = {\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # text_encoder\n", - " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", - "}\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "# create launcher\n", - "launcher = PythonTransformLauncher(TextEncoderPythonTransformConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "b734852c", - "metadata": { - "id": "b734852c" - }, - "source": [ - "### 8.3 - Inspect Generated output\n", - "\n", - "You will see a column called `embeddings` added at the end. This the text content converted into vectors or embeddings. We used the model `sentence-transformers/all-MiniLM-L6-v2`" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "7b1c1d09", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 760 - }, - "id": "7b1c1d09", - "outputId": "018daa18-e5db-4483-d8d5-30aded80d5e3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (7, 19)\n", - "Output data dimensions (rows x columns)= (7, 20)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremovedembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...[-0.051861435, 0.0035226212, 0.030617002, 0.04...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[][0.07728295, 0.024970993, -0.043180738, 0.0580...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[][0.10598018, 0.025460618, 0.023627337, 0.03905...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[][0.0077404436, -0.02055944, 0.026426593, 0.011...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[][-0.062105548, -0.0053322907, 0.031277698, 0.0...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[][0.072435796, -0.058001805, -0.019771898, -0.0...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[][0.091821924, 0.015197902, 0.07716932, 0.01711...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "3 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "3 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", - "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", - "\n", - " removed \\\n", - "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... \n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] \n", - "\n", - " embeddings \n", - "0 [-0.051861435, 0.0035226212, 0.030617002, 0.04... \n", - "1 [0.07728295, 0.024970993, -0.043180738, 0.0580... 
\n", - "2 [0.10598018, 0.025460618, 0.023627337, 0.03905... \n", - "3 [0.0077404436, -0.02055944, 0.026426593, 0.011... \n", - "4 [-0.062105548, -0.0053322907, 0.031277698, 0.0... \n", - "5 [0.072435796, -0.058001805, -0.019771898, -0.0... \n", - "6 [0.091821924, 0.015197902, 0.07716932, 0.01711... " - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "f5e12630-be6b-4188-a925-77117155617b", - "metadata": { - "id": "f5e12630-be6b-4188-a925-77117155617b" - }, - "source": [ - "## Step-9: Copy output to final output dir" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "outputId": "31f09b58-7b2d-48bb-9dac-bc0ba9625c01" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Copied output from 'output/05_embeddings_out' --> 'output/output_final'\n" - ] - } - ], - "source": [ - "import shutil\n", - "\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", - "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", - "\n", - "print (f\"โœ… Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "dpk-2-basic-021-py311", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - 
"mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "06f9b33494984e4885d5aad813d1d2bc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "1cb3bbf7d724411cbe9831543a4aecc0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "553f3c16839a49d79591d0fc4862bed6": { - "model_module": 
"@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7053c9606a414e978636a7e241909504": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1cb3bbf7d724411cbe9831543a4aecc0", - "placeholder": "โ€‹", - "style": "IPY_MODEL_06f9b33494984e4885d5aad813d1d2bc", - "value": "โ€‡10/10โ€‡[00:00<00:00,โ€‡349.38it/s]" - } - }, - "724778729161445c98b187031ae4f67c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - 
"model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "97b603697cfa4b4ea4e6735b6768ca35": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e87e8d3262c54cfaaa8768505edacda3", - "IPY_MODEL_b78aa40816e44f7fbebcb24ca68818b3", - "IPY_MODEL_7053c9606a414e978636a7e241909504" - ], - "layout": "IPY_MODEL_da0787b239764847a731083997780a85" - } - }, - "9d184ed175f0403fb03c2e13dfd04e0a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - 
"max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b78aa40816e44f7fbebcb24ca68818b3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9d184ed175f0403fb03c2e13dfd04e0a", - "max": 10, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_724778729161445c98b187031ae4f67c", - "value": 10 - } - }, - "c0eb5bc8f6ee427ca42204b3c56f9a4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "da0787b239764847a731083997780a85": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - 
"display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e87e8d3262c54cfaaa8768505edacda3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_553f3c16839a49d79591d0fc4862bed6", - "placeholder": "โ€‹", - "style": "IPY_MODEL_c0eb5bc8f6ee427ca42204b3c56f9a4e", - "value": "Fetchingโ€‡10โ€‡files:โ€‡100%" - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/notebooks/intro/dpk_intro_1_ray.ipynb b/examples/notebooks/intro/dpk_intro_1_ray.ipynb deleted file mode 100644 index b2feb9135..000000000 --- a/examples/notebooks/intro/dpk_intro_1_ray.ipynb +++ /dev/null @@ -1,4359 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", - "metadata": { - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" - }, - "source": [ - "# Data Prep Kit Demo 1 - Ray Version\n", - "\n", - "This notebook will introduce DPK and showcase some of it's 
capabilities.\n", - "\n", - "Here is the workflow\n", - "\n", - "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "b15976e3", - "metadata": { - "id": "b15976e3" - }, - "source": [ - "## How to run this notebook\n", - "\n", - "Two options:\n", - "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n", - "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", - "\n", - "The notebook will work as in both environments" - ] - }, - { - "cell_type": "markdown", - "id": "eb8b0d5c", - "metadata": { - "id": "eb8b0d5c" - }, - "source": [ - "## Step-1: Inspect the Data\n", - "\n", - "We will use simple PDFs about Solar system. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n", - "\n", - "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n", - "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n" - ] - }, - { - "cell_type": "markdown", - "id": "39a0ab6e", - "metadata": { - "id": "39a0ab6e" - }, - "source": [ - "## Step-2: Figure out Runtime Environment\n", - "\n", - "### 2.1 - Determine runtime\n", - "\n", - "Determine if we are running on Google colab or local python environment" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1fe354b7", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1fe354b7", - "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "markdown", - "id": "8e7c104b", - "metadata": { - "id": "8e7c104b" - }, - "source": [ - "### 2.2 -Download Data if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3309799e", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3309799e", - "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " !mkdir -p 'input/solar-system'\n", - " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n", - " !wget -O 'input/solar-system/mars.pdf' 
'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n", - " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'" - ] - }, - { - "cell_type": "markdown", - "id": "a5dc2b68", - "metadata": { - "id": "a5dc2b68" - }, - "source": [ - "### 2.3 - Install dependencies if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1fcec577", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1fcec577", - "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " ! pip install --default-timeout=100 \\\n", - " data-prep-toolkit==0.2.1 \\\n", - " data-prep-toolkit-transforms==0.2.1 \\\n", - " data-prep-toolkit-transforms-ray==0.2.1 \\\n", - " deepsearch-toolkit" - ] - }, - { - "cell_type": "markdown", - "id": "243322b8", - "metadata": { - "id": "243322b8" - }, - "source": [ - "### 2.4 - Restart Runtime\n", - "\n", - "After installing dependencies, be sure restart runtime, so libraries will be loaded\n", - "\n", - "You do this by going to **`Runtime --> Restart Session`**\n", - "\n", - "Then you can continue to the next step (no need to re-run the notebook)" - ] - }, - { - "cell_type": "markdown", - "id": "e8b10be1", - "metadata": { - "id": "e8b10be1" - }, - "source": [ - "## Step-2: Configuration" - ] - }, - { - "cell_type": "markdown", - "id": "356c66f7", - "metadata": { - "id": "356c66f7" - }, - "source": [ - "### 2.1 - Basic Config" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e4YMZrBuFycl", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e4YMZrBuFycl", - "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - 
"\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "33345487", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "33345487", - "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n", - "MY_CONFIG.RAY_NUM_CPUS: 0.8\n", - "MY_CONFIG.RAY_MEMORY_GB: 2\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "## Configuration\n", - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig ()\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", - "\n", - "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", - "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", - "\n", - "## Embedding model\n", - "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", - "\n", - "## RAY CONFIGURATION\n", - "### For local runs, we can use more parallelism\n", - "### For google colab, be conservative\n", - "\n", - "if RUNNING_IN_COLAB:\n", - " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", - " MY_CONFIG.RAY_NUM_CPUS = 0.3\n", - " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", - "else: # local run\n", - " num_cpus_available = os.cpu_count()\n", - " # print (num_cpus_available)\n", - "\n", - " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", - " MY_CONFIG.RAY_NUM_CPUS = 0.8\n", - " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", - " # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n", - "\n", - "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n", - "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n", - "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b15e6827", - "metadata": { - "id": 
"b15e6827" - }, - "outputs": [], - "source": [ - "## Add parent dir to path\n", - "import os,sys\n", - "\n", - "this_dir = os.path.abspath('')\n", - "parent_dir = os.path.dirname(this_dir)\n", - "sys.path.append (os.path.abspath (parent_dir))" - ] - }, - { - "cell_type": "markdown", - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", - "metadata": { - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" - }, - "source": [ - "### 2.2 - Setup input/outpur directories" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "outputId": "ec5beb05-027a-49eb-9a96-271471619d81" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Cleared output directory\n" - ] - } - ], - "source": [ - "import os, sys\n", - "import shutil\n", - "\n", - "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", - " raise Exception (f\"โŒ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", - "\n", - "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", - "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", - "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", - "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", - "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", - "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", - "\n", - "## clear output folder\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", - "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", - "\n", - "print (\"โœ… Cleared output directory\")" - ] - }, - { - "cell_type": "markdown", - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", - "metadata": { - "id": 
"2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" - }, - "source": [ - "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", - "\n", - "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", - "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", - "metadata": { - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" - }, - "source": [ - "### 3.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" - ] - } - ], - "source": [ - "STAGE = 1\n", - "\n", - "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", - "output_folder = output_parquet_dir\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", - "metadata": { - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" - }, - "source": [ - "### 3.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "outputId": "14a36e73-a186-4431-a755-f46ccb691130" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 
'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", - "13:30:44 INFO - pipeline id pipeline_id\n", - "13:30:44 INFO - code location None\n", - "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n", - "13:30:44 INFO - actor creation delay 0\n", - "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", - "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "13:30:44 INFO - Running locally\n", - "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n", - "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 110376.42it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n", - "Fetching 10 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:00<00:00, 73713.60it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:1 completed successfully\n", - "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n", - "Wall time: 31.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from pdf2parquet_transform import (\n", - " pdf2parquet_contents_type_cli_param,\n", - " pdf2parquet_contents_types,\n", - ")\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", - "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", - "\n", - "from data_processing.utils import GB, ParamsUtils\n", - "\n", - "\n", - "# create parameters\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": 
MY_CONFIG.RAY_MEMORY_GB * GB}\n", - "ingest_config = {\n", - " pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n", - "}\n", - "\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": 1, # so model download to cleanup works properly\n", - "\n", - "}\n", - "\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", - "# create launcher\n", - "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", - "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "5ca790e0", - "metadata": { - "id": "5ca790e0" - }, - "source": [ - "### 3.3 - Inspect Generated output\n", - "\n", - "Here we should see one entry per input file processed." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "fe59563d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 - }, - "id": "fe59563d", - "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Output dimensions (rows x columns)= (2, 12)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...101162e5639f-f922-4ccc-a041-3cb02f1cfd83pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011f3c0ac2e-1de2-472b-8216-2043f3b3e9d1pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdf
\n", - "
" - ], - "text/plain": [ - " filename contents num_pages \\\n", - "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "\n", - " num_tables num_doc_elements document_id ext \\\n", - "0 0 11 62e5639f-f922-4ccc-a041-3cb02f1cfd83 pdf \n", - "1 0 11 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.494027 2.015123 earth.pdf " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(5)\n", - "\n", - "## To display certain columns\n", - "#parquet_df[['column1', 'column2', 'column3']].head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "e5058a21", - "metadata": { - "id": "e5058a21" - }, - "source": [ - "\n", - "### 3.4 - Understand the output\n", - "\n", - "Here are some interesting attributes to note:\n", - "\n", - "- **filename** : original filename\n", - "- **contents** : text\n", - "- **document_id**: unique id (UUID) assignd to this document\n", - "- **hash** : hash of document\n", - "- **pdf_convert_time** : time to convert this pdf in seconds\n", - "\n", - "Let's inspect the **contents** column. See how the text is being divided up!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "f870e624", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f870e624", - "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", - " 'filename': 'mars.pdf',\n", - " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.35137939,\n", - " 654.45184326,\n", - " 169.88169861,\n", - " 667.98492432],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.09541321,\n", - " 630.68127441,\n", - " 210.66503906,\n", - " 642.34405518],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.84518433,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.02520752],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. 
At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.18510437,\n", - " 570.83258057,\n", - " 374.99838257,\n", - " 581.07043457],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about the Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.22866821,\n", - " 542.98168945,\n", - " 163.86282349,\n", - " 554.45288086],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87440491,\n", - " 500.84011841,\n", - " 477.48345947,\n", - " 534.55810547],\n", - " 'page': 1,\n", - " 'span': [0, 196]}],\n", - " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", - " 'desert world with a thin atmosphere composed '\n", - " 'primarily of carbon dioxide. Its reddish hue comes '\n", - " 'from iron oxide, or rust, prevalent on its surface.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.2026062,\n", - " 482.90710449,\n", - " 237.04431152,\n", - " 493.07443237],\n", - " 'page': 1,\n", - " 'span': [0, 23]}],\n", - " 'text': 'Basic facts about Mars:',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 453.019104,\n", - " 477.48171997,\n", - " 474.9703064],\n", - " 'page': 1,\n", - " 'span': [0, 78]}],\n", - " 'text': 'ยท Distance from the Sun: Average of 228 million '\n", - " 'kilometers (142 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.79351807,\n", - " 431.73287964,\n", - " 451.2142334],\n", - " 'page': 1,\n", - " 'span': [0, 64]}],\n", - " 'text': 'ยท Rotation Period: 24.6 hours (one Martian day - '\n", - " 'called a \"sol\")',\n", - " 'type': 'paragraph'},\n", - " 
{'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 429.10913086,\n", - " 365.9559021,\n", - " 438.83737183],\n", - " 'page': 1,\n", - " 'span': [0, 44]}],\n", - " 'text': 'ยท Moons: Two small moons, Phobos and Deimos.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.51646423],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } - ], - "source": [ - "import pprint\n", - "import json\n", - "\n", - "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", - "# json.loads(output_df.iloc[0, ]['contents'])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "e1a10c2d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e1a10c2d", - "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", - " 'filename': 'earth.pdf',\n", - " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.30961609,\n", - " 654.45184326,\n", - " 174.04208374,\n", - " 667.93347168],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.12528992,\n", - " 
630.69073486,\n", - " 210.66503906,\n", - " 642.27935791],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87112427,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.04595947],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.20942688,\n", - " 570.81555176,\n", - " 375.57919312,\n", - " 581.08459473],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about our Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.15542603,\n", - " 542.98168945,\n", - " 167.32983398,\n", - " 554.36669922],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.91053772,\n", - " 512.46295166,\n", - " 477.84887695,\n", - " 534.48431396],\n", - " 'page': 1,\n", - " 'span': [0, 107]}],\n", - " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", - " 'planet. 
Earth is the only place we know of with life.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.30151367,\n", - " 494.86206055,\n", - " 240.17156982,\n", - " 505.07229614],\n", - " 'page': 1,\n", - " 'span': [0, 24]}],\n", - " 'text': 'Basic facts about Earth:',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 464.97409058,\n", - " 477.47979736,\n", - " 487.02810669],\n", - " 'page': 1,\n", - " 'span': [0, 79]}],\n", - " 'text': 'ยท Distance from the Sun: Average of 149.6 million '\n", - " 'kilometers (93 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 452.86901855,\n", - " 317.90722656,\n", - " 463.24041748],\n", - " 'page': 1,\n", - " 'span': [0, 37]}],\n", - " 'text': 'ยท Rotation Period: 24 hours (one day)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.71496582,\n", - " 396.66357422,\n", - " 451.19915771],\n", - " 'page': 1,\n", - " 'span': [0, 52]}],\n", - " 'text': 'ยท Moons: One moon, called Luna or simply \"the Moon\".',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.53633118],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } - ], - "source": [ - "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" - ] - }, - { - "cell_type": "markdown", - "id": "72274586", - "metadata": { - "id": "72274586" - }, - "source": [ - "## Step-4: Doc chunks\n", - "\n", - "In the previous step, we have extracted text from oru PDFs. 
But we have the content of entire file as 'one row' in our parquet output.\n", - "\n", - "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", - "\n", - "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", - "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", - "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", - "which provides the required JSON structure." - ] - }, - { - "cell_type": "markdown", - "id": "96198fa6", - "metadata": { - "id": "96198fa6" - }, - "source": [ - "### 4.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "305f00a3", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "305f00a3", - "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" - ] - } - ], - "source": [ - "STAGE = 2\n", - "\n", - "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_chunk_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "369f2cd1", - "metadata": { - "id": "369f2cd1" - }, - "source": [ - "### 4.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "5b7b18d5", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - 
}, - "id": "5b7b18d5", - "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", - "13:31:12 INFO - pipeline id pipeline_id\n", - "13:31:12 INFO - code location None\n", - "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:12 INFO - actor creation delay 0\n", - "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", - "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:12 INFO - Running locally\n", - "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n", - "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:2 completed successfully\n", - "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n", - "Wall time: 18.9 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # doc_chunk arguments\n", - " # ...\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "213afdf6", - "metadata": { - "id": "213afdf6" - }, - "source": [ - "### 4.3 - Inspect Generated output\n", - "\n", - "We would see documents are split into many chunks" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "d8138d43", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 897 - }, - "id": "d8138d43", - "outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Files processed : 2\n", - "Chunks created : 8\n", - "Input data dimensions (rows x columns)= (2, 12)\n", - "Output data dimensions (rows x columns)= (8, 16)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 
" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (f\"Files processed : {input_df.shape[0]:,}\")\n", - "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "9e9ca75c", - "metadata": { - "id": "9e9ca75c" - }, - "source": [ - "### 4.4 - Understanding the Output\n", - "\n", - "Here we see 2 PDF files are split into 6 chunks. Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n", - "\n", - "See how **document_id** is carried throughout. This helps us identify original documents.\n", - "\n", - "Also note **contents** is now plain text (not JSON as before)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "3090c950", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - }, - "id": "3090c950", - "outputId": "3f542446-2cfa-404c-c642-3732f7b74568" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\nยท Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Solar System\\nFor more details about the Solar...\n", - "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "3 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "5 earth.pdf Solar System\\nFor more details about our Solar...\n", - "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "7 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d5f151ae", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "d5f151ae", - "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 3------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "20217298", - "metadata": { - "id": "20217298" - }, - "source": [ - "## Step-5: DOC ID generation\n", - "\n", - "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", - "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. 
The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", - "\n", - "**This is a pre-requisite for fuzzy dedup** in the pipeline." - ] - }, - { - "cell_type": "markdown", - "id": "66811f5b", - "metadata": { - "id": "66811f5b" - }, - "source": [ - "### 5.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "1f747c0d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1f747c0d", - "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" - ] - } - ], - "source": [ - "\n", - "# Input for this stage is the output of exact dedeup component\n", - "# output of this component makes it possible for fdedup component to run on data.\n", - "\n", - "STAGE = 3\n", - "\n", - "input_folder = output_chunk_dir\n", - "output_folder = output_docid_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "18aa0fe1", - "metadata": { - "id": "18aa0fe1" - }, - "source": [ - "### 5.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "f6e9e145", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f6e9e145", - "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73" - }, - "outputs": [ - { - 
"name": "stderr", - "output_type": "stream", - "text": [ - "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", - "13:31:29 INFO - pipeline id pipeline_id\n", - "13:31:29 INFO - code location None\n", - "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:29 INFO - actor creation delay 0\n", - "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", - "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:29 INFO - Running locally\n", - "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", - "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:3 completed successfully\n", - "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n", - "Wall time: 15.2 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # doc id configuration\n", - " \"doc_id_doc_column\": \"contents\",\n", - " \"doc_id_hash_column\": \"chunk_hash\",\n", - " \"doc_id_int_column\": \"chunk_id\",\n", - "}\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "4954402f", - "metadata": { - "id": "4954402f" - }, - "source": [ - "### 5.3 - Inspect Generated output\n", - "\n", - "You will notice we have two 
extra columns\n", - "\n", - "- **hash_column**\n", - "- **int_id_column**\n", - "\n", - "But still the same number or rows as before" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "1911179a", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 860 - }, - "id": "1911179a", - "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 16)\n", - "Output data dimensions (rows x columns)= (8, 18)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 
2 \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "852829dc", - "metadata": { - "id": "852829dc" - }, - "source": [ - "## Step-6: Exact Dedup\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", - "metadata": { - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" - }, - "source": [ - "### 6.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "4c7a1b94", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4c7a1b94", - "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" - ] - } - ], - "source": [ - "STAGE = 4\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_exact_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", - "metadata": { - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" - }, - "source": [ - "### 6.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", - "metadata": { - "colab": { - 
"base_uri": "https://localhost:8080/" - }, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", - "outputId": "bd0f3f94-8c48-4c6b-b911-858e389243f4" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:31:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", - "13:31:45 INFO - pipeline id pipeline_id\n", - "13:31:45 INFO - code location None\n", - "13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:45 INFO - actor creation delay 0\n", - "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", - "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:45 INFO - Running locally\n", - "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n", - "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:4 completed successfully\n", - "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n", - "Wall time: 15.2 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # ededup parameters\n", - " \"ededup_hash_cpu\": 0.5,\n", - " \"ededup_num_hashes\": 2,\n", - " \"ededup_doc_column\": \"contents\",\n", - " \"ededup_doc_id_column\": \"chunk_hash\",\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "eaf1c3c3", - "metadata": { - "id": "eaf1c3c3" - }, - "source": [ - "### 6.3 - Inspect Generated output" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "d824ebf6", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 815 - }, - "id": "d824ebf6", - "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (7, 19)\n", - "Input chunks before exact dedupe : 8\n", - "Output chunks after exact dedupe : 7\n", - "Duplicate chunks removed : 1\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", - "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "6 Earth\\nBasic facts about Earth:\\nยท Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", - "\n", - " removed \n", - "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... 
\n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "82cc9bb0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 269 - }, - "id": "82cc9bb0", - "outputId": "e043fa01-ceca-49ae-b764-8154219c7b6c" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nFor more details about the Solar...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "4 earth.pdf Solar System\\nFor more details about our Solar...\n", - "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "6 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "cc61dffa", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cc61dffa", - "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "· Rotation Period: 24 hours (one day)\n", - "· Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "383f40ba", - "metadata": { - "id": "383f40ba" - }, - "source": [ - "### 6.4 - Understanding the output\n", - "\n", - "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", - "\n", - "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", - "\n", - "```text\n", - "## Solar System\n", - "\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "85309751-8556-41c6-ac32-84acc941bc8d", - "metadata": { - "id": "85309751-8556-41c6-ac32-84acc941bc8d" - }, - "source": [ - "## Step-7: Fuzzy Dedup\n", - "\n", - "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n", - "the data further.\n", - "\n", - "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addition of logging statements etc." - ] - }, - { - "cell_type": "markdown", - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", - "metadata": { - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6" - }, - "source": [ - "### 7.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", - "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n" - ] - } - ], - "source": [ - "## Input to this component is the output of doc_id generator component.\n", - "\n", - "STAGE = 5\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_fuzzy_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", - "metadata": { - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3" - }, - "source": [ - "### 7.2 - Execute" - ] - }, - { - "cell_type": 
"code", - "execution_count": 27, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", - "outputId": "1e63d364-3944-465a-ff7c-6e1dc750b2de" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", - "13:32:00 INFO - pipeline id pipeline_id\n", - "13:32:00 INFO - code location None\n", - "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:00 INFO - actor creation delay 0\n", - "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n", - "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:00 INFO - Running locally\n", - "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%) in 0.064 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n", - "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n", - "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n", - "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n", - "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:5 completed successfully\n", - "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n", - "Wall time: 36.6 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "\n", - "# create parameters\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # Orchestration parameters\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # columns used\n", - " \"fdedup_doc_column\": \"contents\",\n", - " \"fdedup_id_column\": \"chunk_id\",\n", - " \"fdedup_cluster_column\": \"chunk_hash\",\n", - " # infrastructure\n", - " \"fdedup_bucket_cpu\": 0.3,\n", - " \"fdedup_doc_cpu\": 0.3,\n", - " \"fdedup_mhash_cpu\": 0.3,\n", - " \"fdedup_num_doc_actors\": 1,\n", - " \"fdedup_num_bucket_actors\": 1,\n", - " \"fdedup_num_minhash_actors\": 1,\n", - " \"fdedup_num_preprocessors\": 1,\n", - " # fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.7, # (default 0.8)\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", - "}\n", - "\n", - "# Pass commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "a6f8cd11", - "metadata": { - "id": "a6f8cd11" - }, - "source": [ - "### 7.3 - Inspect Generated output" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "e899ad60", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 677 - }, - "id": "e899ad60", - "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (6, 18)\n", - "Duplicate chunks removed by fuzzy-dedupe: 2\n" - ] - 
}, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hash
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... 
$.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id chunk_id chunk_hash \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", - "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", - "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", - "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 " - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "ab7ea52b", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 238 - }, - "id": "ab7ea52b", - "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\nยท Distance from the S...
3earth.pdfSolar System\\nFor more details about our Solar...
4earth.pdfEarth\\nEarth is the third planet from the Sun....
5earth.pdfEarth\\nBasic facts about Earth:\\nยท Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\nยท Distance from the S...\n", - "3 earth.pdf Solar System\\nFor more details about our Solar...\n", - "4 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "5 earth.pdf Earth\\nBasic facts about Earth:\\nยท Distance fr..." - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "6bdd3515", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6bdd3515", - "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "ยท Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "ยท Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "ยท Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Basic facts about Earth:\n", - "ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "ยท Rotation Period: 24 hours (one day)\n", - "ยท Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "2b34d9c6", - "metadata": { - "id": "2b34d9c6" - }, - "source": [ - "### 7.4- Understanding the output\n", - "\n", - "So we started with 7 rows and ended up with 6. Fuzzy dedupe removed the following **very similar** chunk.\n", - "\n", - "These are pretty similar chunks except for the words 'the' and 'our'\n", - "\n", - "**earth.pdf**\n", - "\n", - "`For more details about *our* Solar system see Chapter 1.`\n", - "\n", - "**mars.pdf**\n", - "\n", - "`For more details about *the* Solar system see Chapter 1.`\n", - "\n", - "Pretty neat, eh? 
๐Ÿ‘\n", - "\n", - "### Configuring Fuzzy de-dupe\n", - "\n", - "You can tweak fuzzy dedupe by tweaking the following parameters\n", - "\n", - "```python\n", - "# fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.7, # (default 0.8)\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", - "```\n", - "\n", - "In our case, we set `fdedup_threshold` parameter to 0.7. \n" - ] - }, - { - "cell_type": "markdown", - "id": "5370950a-2a3a-4143-8218-f9b4808099ba", - "metadata": { - "id": "5370950a-2a3a-4143-8218-f9b4808099ba" - }, - "source": [ - "## Step-8: Text encoding\n", - "\n", - "Encode text for the vector storage." - ] - }, - { - "cell_type": "markdown", - "id": "85aba685", - "metadata": { - "id": "85aba685" - }, - "source": [ - "### 8.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "๐Ÿƒ๐Ÿผ STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" - ] - } - ], - "source": [ - "STAGE = 6\n", - "\n", - "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_embeddings_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "c97545f4", - "metadata": { - "id": "c97545f4" - }, - "source": [ - "### 8.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "metadata": { - "colab": { - 
"base_uri": "https://localhost:8080/", - "height": 914, - "referenced_widgets": [ - "8b7571c585df431eb901fcdebdf8177e", - "06107a2f48b3491f91bbe84e46e10ba0", - "bd74356eca18423aa0373c808d9097e3", - "7e13e8779a81400f996d4428c74acfaf", - "a75892696be546a3970962bae7bf732a", - "68997339f13240a4824a9e416096bee4", - "919b086abd314077bbff75687392bd91", - "b4c209371e7a403986991a786cfb296d", - "6c08de2dd9a2402c90b1a7a645db9b13", - "91fff81a1de8487c9009e872b751edb0", - "ada62d24cbcf4361acbb21808f334d33" - ] - }, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", - "13:32:37 INFO - pipeline id pipeline_id\n", - "13:32:37 INFO - code location None\n", - "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:37 INFO - actor creation delay 0\n", - "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", - "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:37 INFO - Running locally\n", - "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n", - "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Stage:6 completed successfully\n", - "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n", - "Wall time: 22.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # text_encoder\n", - " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", - "}\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "# create launcher\n", - "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n", - "# Launch the ray actor(s) to process the input\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"โŒ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "b734852c", - "metadata": { - "id": "b734852c" - }, - "source": [ - "### 8.3 - Inspect Generated output\n", - "\n", - "You will see a column called `embeddings` added at the end. This the text content converted into vectors or embeddings. We used the model `sentence-transformers/all-MiniLM-L6-v2`" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "7b1c1d09", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 659 - }, - "id": "7b1c1d09", - "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (6, 18)\n", - "Output data dimensions (rows x columns)= (6, 19)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hashembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1[0.0077404897, -0.020559434, 0.026426662, 0.01...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1[0.07728298, 0.024971062, -0.04318075, 0.05809...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\nยท Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1[0.1059802, 0.025460616, 0.02362733, 0.0390564...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15[-0.062105577, -0.0053322953, 0.03127779, 0.04...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1[0.0724358, -0.058001805, -0.01977186, -0.0243...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\nยท Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1[0.091821924, 0.015197907, 0.07716932, 0.01711...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\nยท Distance from the S... $.main-text[6] \n", - "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "5 Earth\\nBasic facts about Earth:\\nยท Distance fr... 
$.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id chunk_id chunk_hash \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", - "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", - "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", - "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 \n", - "\n", - " embeddings \n", - "0 [0.0077404897, -0.020559434, 0.026426662, 0.01... \n", - "1 [0.07728298, 0.024971062, -0.04318075, 0.05809... \n", - "2 [0.1059802, 0.025460616, 0.02362733, 0.0390564... \n", - "3 [-0.062105577, -0.0053322953, 0.03127779, 0.04... \n", - "4 [0.0724358, -0.058001805, -0.01977186, -0.0243... \n", - "5 [0.091821924, 0.015197907, 0.07716932, 0.01711... 
" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "f5e12630-be6b-4188-a925-77117155617b", - "metadata": { - "id": "f5e12630-be6b-4188-a925-77117155617b" - }, - "source": [ - "## Step-9: Copy output to final output dir" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "โœ… Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" - ] - } - ], - "source": [ - "import shutil\n", - "\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", - "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", - "\n", - "print (f\"โœ… Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "dc0a6728", - "metadata": { - "id": "dc0a6728" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "dpk-3-basic-022dev1-py311", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - }, - "widgets": { 
- "application/vnd.jupyter.widget-state+json": { - "06107a2f48b3491f91bbe84e46e10ba0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4", - "placeholder": "โ€‹", - "style": "IPY_MODEL_919b086abd314077bbff75687392bd91", - "value": "" - } - }, - "68997339f13240a4824a9e416096bee4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6c08de2dd9a2402c90b1a7a645db9b13": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "7e13e8779a81400f996d4428c74acfaf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0", - "placeholder": "โ€‹", - "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33", - "value": "โ€‡0/0โ€‡[00:00<?,โ€‡?it/s]" - } - }, - "8b7571c585df431eb901fcdebdf8177e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0", - "IPY_MODEL_bd74356eca18423aa0373c808d9097e3", - "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf" - ], - "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a" - } - }, - "919b086abd314077bbff75687392bd91": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "91fff81a1de8487c9009e872b751edb0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a75892696be546a3970962bae7bf732a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - 
"border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ada62d24cbcf4361acbb21808f334d33": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b4c209371e7a403986991a786cfb296d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - 
"grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "bd74356eca18423aa0373c808d9097e3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13", - "value": 0 - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/notebooks/intro/images/data-prep-kit-3-workflow.png b/examples/notebooks/intro/images/data-prep-kit-3-workflow.png deleted file mode 100644 index 851adbfeb..000000000 Binary files a/examples/notebooks/intro/images/data-prep-kit-3-workflow.png and /dev/null differ diff --git a/examples/notebooks/intro/.gitignore b/examples/notebooks/pdf-processing-1/.gitignore similarity index 100% rename from examples/notebooks/intro/.gitignore rename to examples/notebooks/pdf-processing-1/.gitignore diff --git a/examples/notebooks/pdf-processing-1/README.md b/examples/notebooks/pdf-processing-1/README.md new file mode 100644 index 000000000..c7fdf8ffb --- /dev/null +++ b/examples/notebooks/pdf-processing-1/README.md @@ -0,0 
+1,53 @@ +# PDF Processing with Data Prep Kit + +Showcases Data Prep Kit capabilities for processing PDFs. + +We will demonstrate the following: + +- Extracting text from PDF files +- Removing duplicates (exact and fuzzy matches) +- Assessing document quality and removing documents containing spam words, placeholder content like 'lorem ipsum', etc. + +**Workflow** + +![](images/data-prep-kit-3-workflow.png) + +## Setting up Python Environment + +The code can be run on either: + +1. Google Colab: very easy to run; no local setup needed. +2. On your local Python environment. Here is a quick guide. You can find instructions for the latest version [here](../../../README.md#-getting-started). + +```bash +conda create -n data-prep-kit -y python=3.11 +conda activate data-prep-kit + +# install the following in 'data-prep-kit' environment +cd examples/notebooks/pdf-processing-1 +pip3 install -r requirements.txt + +# start jupyter and run the notebooks with this jupyter +jupyter lab +``` + +## Data Files + +PDF files are located in [examples/data-files/pdf-processing-1](../../data-files/pdf-processing-1/) + +## Running the code + +[python version](pdf_processing_1_python.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb) + +[ray version](pdf_processing_1_ray.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb) + +## Troubleshooting + +If you encounter any errors loading libraries, try creating a custom kernel and using it to run the notebooks. 
+ +```bash +python -m ipykernel install --user --name=data-prep-kit --display-name "dataprepkit" +# and select this kernel within jupyter notebook +``` + + diff --git a/examples/notebooks/intro/my_utils.py b/examples/notebooks/pdf-processing-1/archived/my_utils.py similarity index 100% rename from examples/notebooks/intro/my_utils.py rename to examples/notebooks/pdf-processing-1/archived/my_utils.py diff --git a/examples/notebooks/intro/images/data-prep-kit-3-workflow.excalidraw b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw similarity index 63% rename from examples/notebooks/intro/images/data-prep-kit-3-workflow.excalidraw rename to examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw index c0525c556..03b19ce3c 100644 --- a/examples/notebooks/intro/images/data-prep-kit-3-workflow.excalidraw +++ b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw @@ -5,44 +5,8 @@ "elements": [ { "type": "image", - "version": 128, - "versionNonce": 146671843, - "index": "b45", - "isDeleted": false, - "id": "nQdFTOsh8Rjwn3poFcnOO", - "fillStyle": "solid", - "strokeWidth": 1, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 258.1818181818182, - "y": 213.63636363636363, - "strokeColor": "transparent", - "backgroundColor": "transparent", - "width": 64, - "height": 64, - "seed": 222183398, - "groupIds": [ - "4aSnKsxGoqeqA7eYu4s2e" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726186954844, - "link": null, - "locked": false, - "status": "saved", - "fileId": "83ba3062a1490699e3ccc129acb25b1f4ec5534d", - "scale": [ - 1, - 1 - ] - }, - { - "type": "image", - "version": 240, - "versionNonce": 2054222979, + "version": 457, + "versionNonce": 173110248, "index": "b46", "isDeleted": false, "id": "hlPJZs7lUbLYhuRbSmYHs", @@ -52,29 +16,23 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 260.90909090909093, - "y": 285.4545454545455, + 
"x": 194.90909090909093, + "y": 202.4545454545455, "strokeColor": "transparent", "backgroundColor": "transparent", "width": 64, "height": 64, "seed": 961787386, - "groupIds": [ - "4aSnKsxGoqeqA7eYu4s2e" - ], + "groupIds": [], "frameId": null, "roundness": null, "boundElements": [ { "id": "FVhCmDYbWjGck9rgcESwp", "type": "arrow" - }, - { - "id": "JMprrs8mNVD4CrqUlVm7i", - "type": "arrow" } ], - "updated": 1726186954844, + "updated": 1737528573258, "link": null, "locked": false, "status": "saved", @@ -82,12 +40,13 @@ "scale": [ 1, 1 - ] + ], + "crop": null }, { "type": "arrow", - "version": 2550, - "versionNonce": 1240871476, + "version": 2976, + "versionNonce": 1926996376, "index": "b47", "isDeleted": false, "id": "FVhCmDYbWjGck9rgcESwp", @@ -97,12 +56,12 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 823.5583207607388, - "y": 273.73602641681657, + "x": 583.0728843528818, + "y": 265.0654681139756, "strokeColor": "#2f9e44", "backgroundColor": "transparent", - "width": 154.2895204048931, - "height": 2.3372664247598323, + "width": 221.74126076768994, + "height": 0.598117686721821, "seed": 1954615226, "groupIds": [], "frameId": null, @@ -110,16 +69,21 @@ "type": 2 }, "boundElements": [], - "updated": 1726708776348, + "updated": 1737528696232, "link": null, "locked": false, "startBinding": { - "elementId": "Wxv71stEiYRpNjyhzzXgO", - "focus": 1.202109076005182, - "gap": 9.103775306193256, + "elementId": "YFlD_rDw6IwCctPG9BjYf", + "focus": 0.841290319837998, + "gap": 12.052870784360664, + "fixedPoint": null + }, + "endBinding": { + "elementId": "DolT9H5aqzEugA7sUfNlx", + "focus": -0.14468495613909563, + "gap": 10.4071488270705, "fixedPoint": null }, - "endBinding": null, "lastCommittedPoint": null, "startArrowhead": null, "endArrowhead": "arrow", @@ -129,61 +93,15 @@ 0 ], [ - 154.2895204048931, - 2.3372664247598323 + 221.74126076768994, + -0.598117686721821 ] ] }, - { - "type": "text", - "version": 324, - "versionNonce": 1281521869, - "index": "b4M", - "isDeleted": 
false, - "id": "zSJvmm-7DrsR5-qRb96Kl", - "fillStyle": "solid", - "strokeWidth": 1, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 595.4118679291607, - "y": 242.27481706603328, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "width": 141.51840079198635, - "height": 59.453152259008114, - "seed": 409665722, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [ - { - "id": "JMprrs8mNVD4CrqUlVm7i", - "type": "arrow" - }, - { - "id": "0wYqjwjKHCGbx7CfmDR__", - "type": "arrow" - } - ], - "updated": 1726186894805, - "link": null, - "locked": false, - "fontSize": 23.781260903603247, - "fontFamily": 1, - "text": "2. split into\nchunks", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "2. split into\nchunks", - "autoResize": true, - "lineHeight": 1.25 - }, { "type": "arrow", - "version": 848, - "versionNonce": 138401069, + "version": 1191, + "versionNonce": 1753926120, "index": "b4N", "isDeleted": false, "id": "JMprrs8mNVD4CrqUlVm7i", @@ -193,12 +111,12 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 329.1268602850381, - "y": 278.24885892455757, + "x": 303.3582097473162, + "y": 267.24885892455757, "strokeColor": "#2f9e44", "backgroundColor": "#b2f2bb", - "width": 185.2530890548909, - "height": 2.823455039174007, + "width": 198.02173959261273, + "height": 2.6228850442226985, "seed": 1319994682, "groupIds": [], "frameId": null, @@ -206,19 +124,19 @@ "type": 2 }, "boundElements": [], - "updated": 1726186962183, + "updated": 1737528662023, "link": null, "locked": false, "startBinding": { - "elementId": "hlPJZs7lUbLYhuRbSmYHs", - "focus": -1.189794049219074, - "gap": 7.205686529987929, + "elementId": "QSiEFZIoz081ipwdmU8sg", + "focus": 0.36390758833591985, + "gap": 4.736856944692818, "fixedPoint": null }, "endBinding": { "elementId": "YFlD_rDw6IwCctPG9BjYf", - "focus": 1.1403432588201572, - "gap": 6.460959750980123, + "focus": -0.7972060339621995, + "gap": 
9.46095975098018, "fixedPoint": null }, "lastCommittedPoint": null, @@ -230,15 +148,15 @@ 0 ], [ - 185.2530890548909, - -2.823455039174007 + 198.02173959261273, + -2.6228850442226985 ] ] }, { "type": "text", - "version": 757, - "versionNonce": 361576332, + "version": 865, + "versionNonce": 1985915368, "index": "b4O", "isDeleted": false, "id": "G0k27V_VE7lyh7YGr_fts", @@ -248,11 +166,11 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 1128.9917648038, - "y": 212.9780740734803, + "x": 934.9917648037998, + "y": 247.9780740734803, "strokeColor": "#1e1e1e", "backgroundColor": "#b2f2bb", - "width": 110.85037231445312, + "width": 100.90922546386719, "height": 58.225670034857664, "seed": 970452474, "groupIds": [], @@ -264,23 +182,23 @@ "type": "arrow" } ], - "updated": 1726708803406, + "updated": 1737528832732, "link": null, "locked": false, "fontSize": 23.290268013943066, "fontFamily": 1, - "text": "4. dedupe\n(exact)", + "text": "3. exact\ndedupe", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "4. dedupe\n(exact)", + "originalText": "3. 
exact\ndedupe", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 598, - "versionNonce": 1689279715, + "version": 614, + "versionNonce": 181505944, "index": "b4g", "isDeleted": false, "id": "XUbC5cWQCm-GEFrdqZW7g", @@ -290,8 +208,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 333.94038113680745, - "y": 243.15978750685963, + "x": 319.94038113680745, + "y": 233.15978750685963, "strokeColor": "#1e1e1e", "backgroundColor": "#ffc9c9", "width": 173.54608154296875, @@ -306,7 +224,7 @@ "type": "arrow" } ], - "updated": 1726187078639, + "updated": 1737528653755, "link": null, "locked": false, "fontSize": 22.766190549743982, @@ -319,183 +237,10 @@ "autoResize": true, "lineHeight": 1.25 }, - { - "type": "image", - "version": 145, - "versionNonce": 1461008621, - "index": "b4h", - "isDeleted": false, - "id": "XH-Rt0Q5-K2g4tM9reh76", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 520.8409090909091, - "y": 209.88636363636368, - "strokeColor": "transparent", - "backgroundColor": "transparent", - "width": 64, - "height": 64, - "seed": 1159948140, - "groupIds": [ - "KKvJ56bTHwzAbN8YXYU0-" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726186894805, - "link": null, - "locked": false, - "status": "saved", - "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", - "scale": [ - 1, - 1 - ] - }, - { - "type": "image", - "version": 193, - "versionNonce": 1127846733, - "index": "b4i", - "isDeleted": false, - "id": "YFlD_rDw6IwCctPG9BjYf", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 520.8409090909091, - "y": 279.8863636363637, - "strokeColor": "transparent", - "backgroundColor": "transparent", - "width": 64, - "height": 64, - "seed": 1369151980, - "groupIds": [ - "KKvJ56bTHwzAbN8YXYU0-" - ], - "frameId": null, - "roundness": null, - "boundElements": [ - { - "id": 
"0wYqjwjKHCGbx7CfmDR__", - "type": "arrow" - }, - { - "id": "JMprrs8mNVD4CrqUlVm7i", - "type": "arrow" - } - ], - "updated": 1726186894805, - "link": null, - "locked": false, - "status": "saved", - "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", - "scale": [ - 1, - 1 - ] - }, - { - "type": "arrow", - "version": 753, - "versionNonce": 1832909987, - "index": "b4j", - "isDeleted": false, - "id": "0wYqjwjKHCGbx7CfmDR__", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 0, - "opacity": 100, - "angle": 0, - "x": 587.6995151292258, - "y": 276.08728311464677, - "strokeColor": "#2f9e44", - "backgroundColor": "#b2f2bb", - "width": 160.10395921482052, - "height": 0.6238794650969908, - "seed": 1397245780, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726186894829, - "link": null, - "locked": false, - "startBinding": { - "elementId": "YFlD_rDw6IwCctPG9BjYf", - "focus": -1.1101505124640194, - "gap": 3.799080521716917, - "fixedPoint": null - }, - "endBinding": { - "elementId": "zSJvmm-7DrsR5-qRb96Kl", - "focus": -0.1259939432648205, - "gap": 10.873205622899263, - "fixedPoint": null - }, - "lastCommittedPoint": null, - "startArrowhead": null, - "endArrowhead": "arrow", - "points": [ - [ - 0, - 0 - ], - [ - 160.10395921482052, - -0.6238794650969908 - ] - ] - }, - { - "type": "text", - "version": 19, - "versionNonce": 1725165603, - "index": "b4t", - "isDeleted": false, - "id": "56KAsZE3Fub50OzL9XJ35", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 344.7055268721148, - "y": 290.01136363636374, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 137.6798553466797, - "height": 25, - "seed": 961622755, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726187031887, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - 
"text": "(pdf2parquet)", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "(pdf2parquet)", - "autoResize": true, - "lineHeight": 1.25 - }, { "type": "text", - "version": 89, - "versionNonce": 1217800429, + "version": 132, + "versionNonce": 1504935576, "index": "b4u", "isDeleted": false, "id": "GEwyTqhl4LrSwcaOeKRT5", @@ -505,71 +250,34 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 514.7055268721148, - "y": 356.01136363636374, + "x": 518.7055268721148, + "y": 383.01136363636374, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 74.97993469238281, + "width": 92.63992309570312, "height": 50, "seed": 31755757, "groupIds": [], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726187172155, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "parquet\nfiles", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "parquet\nfiles", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "text", - "version": 273, - "versionNonce": 821721012, - "index": "b5F", - "isDeleted": false, - "id": "ZGkHBN9UBrJLYPIlm-KTj", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1355.555487199263, - "y": 305.51136363636374, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 118.5198974609375, - "height": 50, - "seed": 1591407981, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726708923087, + "updated": 1737528618509, "link": null, "locked": false, "fontSize": 20, "fontFamily": 5, - "text": "duplicate 'B'\nis removed", + "text": "markdown\ntext", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "duplicate 'B'\nis removed", + "originalText": "markdown\ntext", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 747, - 
"versionNonce": 104645940, + "version": 804, + "versionNonce": 859000296, "index": "b5G", "isDeleted": false, "id": "DolT9H5aqzEugA7sUfNlx", @@ -579,34 +287,39 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 827.643003983931, - "y": 226.3985286189349, + "x": 596.643003983931, + "y": 231.3985286189349, "strokeColor": "#1e1e1e", "backgroundColor": "#b2f2bb", - "width": 166.41502380371094, - "height": 29.112835017428832, + "width": 197.7639923095703, + "height": 58.225670034857664, "seed": 466678605, "groupIds": [], "frameId": null, "roundness": null, - "boundElements": [], - "updated": 1726708795102, + "boundElements": [ + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" + } + ], + "updated": 1737528686607, "link": null, "locked": false, "fontSize": 23.290268013943066, "fontFamily": 1, - "text": "3. document id", + "text": "2. document id\n(compute hashes)", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "3. document id", + "originalText": "2. 
document id\n(compute hashes)", "autoResize": true, "lineHeight": 1.25 }, { "type": "arrow", - "version": 1071, - "versionNonce": 474965812, + "version": 1254, + "versionNonce": 980324072, "index": "b5U", "isDeleted": false, "id": "cXhTkxU13WdQeAv3Z_1mR", @@ -616,12 +329,12 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 1318.993474938044, - "y": 401.3233033689122, + "x": 1145.993474938044, + "y": 268.31133050044286, "strokeColor": "#2f9e44", "backgroundColor": "#b2f2bb", - "width": 0.8539592148204065, - "height": 113.62612053490295, + "width": 167.8539592148204, + "height": 1.6380934033722951, "seed": 605419139, "groupIds": [], "frameId": null, @@ -629,11 +342,21 @@ "type": 2 }, "boundElements": [], - "updated": 1726709016812, + "updated": 1737528943852, "link": null, "locked": false, - "startBinding": null, - "endBinding": null, + "startBinding": { + "elementId": "Qaz1byDgzm-0ZrVLBmU4v", + "focus": -0.37744699407794313, + "gap": 8.76620221077144, + "fixedPoint": null + }, + "endBinding": { + "elementId": "LbPBuhQ2btuEnjbeSDvuK", + "focus": -2.1413835587747667, + "gap": 14.33294663108768, + "fixedPoint": null + }, "lastCommittedPoint": null, "startArrowhead": null, "endArrowhead": "arrow", @@ -643,15 +366,15 @@ 0 ], [ - 0.8539592148204065, - 113.62612053490295 + 167.8539592148204, + 1.6380934033722951 ] ] }, { "type": "text", - "version": 976, - "versionNonce": 988237964, + "version": 1037, + "versionNonce": 1974786200, "index": "b5V", "isDeleted": false, "id": "Ba_pxAykcwH_ZsTbAtduc", @@ -661,34 +384,34 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 1218.815207047896, - "y": 429.9549461276493, + "x": 1160.815207047896, + "y": 234.9549461276493, "strokeColor": "#1e1e1e", "backgroundColor": "#b2f2bb", - "width": 184.07017517089844, - "height": 29.112835017428832, + "width": 98.09219360351562, + "height": 58.225670034857664, "seed": 1665190893, "groupIds": [], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726709020882, + 
"updated": 1737528881336, "link": null, "locked": false, "fontSize": 23.290268013943066, "fontFamily": 1, - "text": "5. fuzzy dedupe", + "text": "4. fuzzy\ndedupe", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "5. fuzzy dedupe", + "originalText": "4. fuzzy\ndedupe", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 580, - "versionNonce": 693951668, + "version": 677, + "versionNonce": 1394703256, "index": "b5h", "isDeleted": false, "id": "XFHbtP2KmiHNNjZhz8ajW", @@ -698,8 +421,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1299.1022727272725, - "y": 517.40625, + "x": 1334.1022727272725, + "y": 178.40625, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, @@ -718,14 +441,14 @@ "id": "OdGsWefGyr6uqMl0wC6mH" } ], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false }, { "type": "text", - "version": 323, - "versionNonce": 1216816692, + "version": 420, + "versionNonce": 2107525272, "index": "b5i", "isDeleted": false, "id": "OdGsWefGyr6uqMl0wC6mH", @@ -735,8 +458,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1315.9786418568, - "y": 522.40625, + "x": 1350.9786418568, + "y": 183.40625, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 13.519989013671875, @@ -748,7 +471,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false, "fontSize": 20, @@ -763,8 +486,8 @@ }, { "type": "rectangle", - "version": 573, - "versionNonce": 1856782260, + "version": 677, + "versionNonce": 1612348312, "index": "b5j", "isDeleted": false, "id": "NzWqph0M7tEkeTDKLPGZR", @@ -774,8 +497,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1301.1931818181815, - "y": 564.5880681818182, + "x": 1336.1931818181815, + "y": 225.58806818181824, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, @@ -792,16 +515,20 
@@ { "type": "text", "id": "K1QK2dyVWiWfd32P8ovQK" + }, + { + "id": "-CNAjEmW6cbufb2V3aXbb", + "type": "arrow" } ], - "updated": 1726708989657, + "updated": 1737530583902, "link": null, "locked": false }, { "type": "text", - "version": 264, - "versionNonce": 334637364, + "version": 364, + "versionNonce": 150023400, "index": "b5k", "isDeleted": false, "id": "K1QK2dyVWiWfd32P8ovQK", @@ -811,11 +538,11 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1317.219552473588, - "y": 569.5880681818182, + "x": 1351.329545454545, + "y": 230.58806818181824, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 15.219985961914062, + "width": 17, "height": 25, "seed": 1350557773, "groupIds": [ @@ -824,7 +551,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708989657, + "updated": 1737530583904, "link": null, "locked": false, "fontSize": 20, @@ -839,8 +566,8 @@ }, { "type": "rectangle", - "version": 680, - "versionNonce": 1002365620, + "version": 777, + "versionNonce": 1889202072, "index": "b5l", "isDeleted": false, "id": "Lf5-FqrnO7iDVhOKUtEnT", @@ -850,8 +577,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1306.9204545454545, - "y": 619.3267045454547, + "x": 1341.9204545454545, + "y": 280.32670454545473, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, @@ -870,14 +597,14 @@ "id": "cTJ-8HZCMcNbXqDHggxAH" } ], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false }, { "type": "text", - "version": 375, - "versionNonce": 213412916, + "version": 472, + "versionNonce": 331955352, "index": "b5m", "isDeleted": false, "id": "cTJ-8HZCMcNbXqDHggxAH", @@ -887,8 +614,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1324.2668248956852, - "y": 624.3267045454547, + "x": 1359.2668248956852, + "y": 285.32670454545473, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 12.579986572265625, @@ -900,7 +627,7 @@ "frameId": null, "roundness": null, 
"boundElements": [], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false, "fontSize": 20, @@ -915,8 +642,8 @@ }, { "type": "text", - "version": 141, - "versionNonce": 1757726132, + "version": 238, + "versionNonce": 900065688, "index": "b5n", "isDeleted": false, "id": "LK6nmMo09HhGvAeViRfcK", @@ -926,8 +653,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1274.397727272727, - "y": 523.3664772727274, + "x": 1309.397727272727, + "y": 184.36647727272737, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 12, @@ -939,7 +666,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false, "fontSize": 20, @@ -954,8 +681,8 @@ }, { "type": "text", - "version": 196, - "versionNonce": 761917108, + "version": 294, + "versionNonce": 1508025832, "index": "b5o", "isDeleted": false, "id": "LbPBuhQ2btuEnjbeSDvuK", @@ -965,8 +692,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1278.397727272727, - "y": 569.6164772727275, + "x": 1313.397727272727, + "y": 230.61647727272748, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 11, @@ -977,8 +704,13 @@ ], "frameId": null, "roundness": null, - "boundElements": [], - "updated": 1726708993287, + "boundElements": [ + { + "id": "cXhTkxU13WdQeAv3Z_1mR", + "type": "arrow" + } + ], + "updated": 1737528943380, "link": null, "locked": false, "fontSize": 20, @@ -993,8 +725,8 @@ }, { "type": "text", - "version": 385, - "versionNonce": 800257204, + "version": 484, + "versionNonce": 1538941848, "index": "b5p", "isDeleted": false, "id": "tEnh5H4Dm1tA62FJY7ZnT", @@ -1004,8 +736,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1279.647727272727, - "y": 629.6164772727275, + "x": 1314.647727272727, + "y": 290.6164772727275, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 11, @@ -1017,7 +749,7 @@ "frameId": null, "roundness": null, "boundElements": [], - 
"updated": 1726709003336, + "updated": 1737528940801, "link": null, "locked": false, "fontSize": 20, @@ -1032,8 +764,8 @@ }, { "type": "text", - "version": 307, - "versionNonce": 51819060, + "version": 406, + "versionNonce": 313505768, "index": "b5q", "isDeleted": false, "id": "TExMhRi4612k0BcybcpHE", @@ -1043,8 +775,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1251.2855058149858, - "y": 678.5113636363637, + "x": 1286.2855058149858, + "y": 339.51136363636374, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 143.59986877441406, @@ -1056,7 +788,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708989657, + "updated": 1737530582726, "link": null, "locked": false, "fontSize": 20, @@ -1069,243 +801,28 @@ "autoResize": true, "lineHeight": 1.25 }, - { - "type": "arrow", - "version": 1039, - "versionNonce": 199529869, - "index": "b5r", - "isDeleted": false, - "id": "KvvwHoDnDT0vBh2bOfiTz", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 0, - "opacity": 100, - "angle": 0, - "x": 1245.243474938044, - "y": 579.5733033689121, - "strokeColor": "#2f9e44", - "backgroundColor": "#b2f2bb", - "width": 192.8960407851796, - "height": 1.126120534903066, - "seed": 1004556899, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726188444758, - "link": null, - "locked": false, - "startBinding": null, - "endBinding": null, - "lastCommittedPoint": null, - "startArrowhead": null, - "endArrowhead": "arrow", - "points": [ - [ - 0, - 0 - ], - [ - -192.8960407851796, - 1.126120534903066 - ] - ] - }, - { - "type": "text", - "version": 989, - "versionNonce": 923042467, - "index": "b5s", - "isDeleted": false, - "id": "cPSHqIr9Peb5h5TNxl3Bb", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 0, - "opacity": 100, - "angle": 0, - "x": 1100.5103669600053, - "y": 536.2049461276495, - "strokeColor": "#1e1e1e", - 
"backgroundColor": "#b2f2bb", - "width": 138.99639892578125, - "height": 29.112835017428832, - "seed": 865272429, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726188447614, - "link": null, - "locked": false, - "fontSize": 23.290268013943066, - "fontFamily": 1, - "text": "6. vectorize", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "6. vectorize", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "diamond", - "version": 103, - "versionNonce": 679668419, - "index": "b5vV", - "isDeleted": false, - "id": "tPvUjMUp7lW3F8V3H2MGV", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 960.0454545454546, - "y": 515.5113636363637, - "strokeColor": "#1e1e1e", - "backgroundColor": "#d0bfff", - "width": 63.75, - "height": 45, - "seed": 782762477, - "groupIds": [ - "CuM_sg3LC9KTYRVST18pX" - ], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726188516836, - "link": null, - "locked": false - }, - { - "type": "diamond", - "version": 117, - "versionNonce": 224511779, - "index": "b5w", - "isDeleted": false, - "id": "uOIVUAj_hGKNiZ3NnQm2n", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 961.9204545454546, - "y": 564.5113636363637, - "strokeColor": "#1e1e1e", - "backgroundColor": "#d0bfff", - "width": 63.75, - "height": 45, - "seed": 1245990083, - "groupIds": [ - "CuM_sg3LC9KTYRVST18pX" - ], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726188516836, - "link": null, - "locked": false - }, - { - "type": "diamond", - "version": 122, - "versionNonce": 1205596301, - "index": "b5x", - "isDeleted": false, - "id": "ylh6O0GmjhRAHndHyuEo2", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 
0, - "x": 966.9204545454546, - "y": 615.7613636363637, - "strokeColor": "#1e1e1e", - "backgroundColor": "#d0bfff", - "width": 63.75, - "height": 45, - "seed": 499397773, - "groupIds": [ - "CuM_sg3LC9KTYRVST18pX" - ], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726188516836, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 260, - "versionNonce": 1136192621, - "index": "b5y", - "isDeleted": false, - "id": "ekXIjXxtZ6f2w_A-9CVUV", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 938.2855058149859, - "y": 670.7613636363637, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 107.5399169921875, - "height": 25, - "seed": 1616985635, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726188507123, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "embeddings", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "embeddings", - "autoResize": true, - "lineHeight": 1.25 - }, { "type": "rectangle", - "version": 381, - "versionNonce": 1618061620, - "index": "b5z", + "version": 589, + "versionNonce": 1049638120, + "index": "b698", "isDeleted": false, - "id": "Uv-8TiLeECJuuNx1yJjtv", + "id": "JNHVvikjirDDllCKotbJC", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 768.5454545454545, - "y": 280.72727272727275, + "x": 844.9545454545454, + "y": 249.68750000000006, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 637818278, + "seed": 848769955, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": { @@ -1313,45 +830,45 @@ }, "boundElements": [ { - "id": "0wYqjwjKHCGbx7CfmDR__", - "type": "arrow" + "type": "text", + "id": 
"8Msc7tXcZdg2UUH2NmUn-" }, { - "type": "text", - "id": "B8Nj-HzRDl-FA-5UJ2hiw" + "id": "M_WCuesgPRdSQ_zqaUtz0", + "type": "arrow" } ], - "updated": 1726708776347, + "updated": 1737528714494, "link": null, "locked": false }, { "type": "text", - "version": 140, - "versionNonce": 1472181260, - "index": "b60", + "version": 348, + "versionNonce": 1968921752, + "index": "b69G", "isDeleted": false, - "id": "B8Nj-HzRDl-FA-5UJ2hiw", + "id": "8Msc7tXcZdg2UUH2NmUn-", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 783.2418233698064, - "y": 285.72727272727275, + "x": 859.6509142788972, + "y": 254.68750000000006, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 17.879989624023438, "height": 25, - "seed": 1971906541, + "seed": 1297532739, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, @@ -1359,33 +876,33 @@ "text": "A'", "textAlign": "center", "verticalAlign": "middle", - "containerId": "Uv-8TiLeECJuuNx1yJjtv", + "containerId": "JNHVvikjirDDllCKotbJC", "originalText": "A'", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 391, - "versionNonce": 1280205492, - "index": "b61", + "version": 626, + "versionNonce": 1609828760, + "index": "b69O", "isDeleted": false, - "id": "l7XMM15Xwzq5xmDF0QvyN", + "id": "fkbHGW5tJ-Ay0sh8h-9hJ", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 764.090909090909, - "y": 186.09090909090912, + "x": 841.4999999999999, + "y": 156.05113636363643, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1556091898, + "seed": 2116216547, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": { @@ -1394,40 
+911,40 @@ "boundElements": [ { "type": "text", - "id": "SZp9x_uNQ-65LQPMQ768C" + "id": "BNiP4zX7PtFTn_e_5vXX3" } ], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false }, { "type": "text", - "version": 132, - "versionNonce": 809849484, - "index": "b62", + "version": 369, + "versionNonce": 753866392, + "index": "b69V", "isDeleted": false, - "id": "SZp9x_uNQ-65LQPMQ768C", + "id": "BNiP4zX7PtFTn_e_5vXX3", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 780.9672782204367, - "y": 191.09090909090912, + "x": 858.3763691295275, + "y": 161.05113636363643, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 13.519989013671875, "height": 25, - "seed": 912377443, + "seed": 1804210819, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, @@ -1435,83 +952,75 @@ "text": "A", "textAlign": "center", "verticalAlign": "middle", - "containerId": "l7XMM15Xwzq5xmDF0QvyN", + "containerId": "fkbHGW5tJ-Ay0sh8h-9hJ", "originalText": "A", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 413, - "versionNonce": 1599597620, - "index": "b63", + "version": 619, + "versionNonce": 553681816, + "index": "b69d", "isDeleted": false, - "id": "Wxv71stEiYRpNjyhzzXgO", + "id": "QYKbNgibs7-HxaNNr8tfG", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 767.1818181818182, - "y": 234.27272727272725, + "x": 843.5909090909089, + "y": 203.23295454545456, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 775085434, + "seed": 1716177443, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": { "type": 3 }, "boundElements": [ - { - 
"id": "0wYqjwjKHCGbx7CfmDR__", - "type": "arrow" - }, - { - "id": "FVhCmDYbWjGck9rgcESwp", - "type": "arrow" - }, { "type": "text", - "id": "zyU1230-bmsHaQTSoi7Ov" + "id": "C-rwFmAbwI_qgVqpkXy7m" } ], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false }, { "type": "text", - "version": 102, - "versionNonce": 1402151180, - "index": "b64", + "version": 310, + "versionNonce": 1247563928, + "index": "b69l", "isDeleted": false, - "id": "zyU1230-bmsHaQTSoi7Ov", + "id": "C-rwFmAbwI_qgVqpkXy7m", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 783.2081888372248, - "y": 239.27272727272725, + "x": 859.6172797463154, + "y": 208.23295454545456, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 15.219985961914062, "height": 25, - "seed": 1842733667, + "seed": 592678339, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, @@ -1519,33 +1028,33 @@ "text": "B", "textAlign": "center", "verticalAlign": "middle", - "containerId": "Wxv71stEiYRpNjyhzzXgO", + "containerId": "QYKbNgibs7-HxaNNr8tfG", "originalText": "B", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 397, - "versionNonce": 997475764, - "index": "b65", + "version": 714, + "versionNonce": 1354136984, + "index": "b69t", "isDeleted": false, - "id": "IkaeA2i4mlTdmulYEI_na", + "id": "m2Wj9fp76PKCAhrulCmTa", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 771.3636363636363, - "y": 325.3636363636364, + "x": 846.3181818181819, + "y": 339.97159090909105, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1839286010, + "seed": 901963107, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + 
"ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": { @@ -1554,265 +1063,1493 @@ "boundElements": [ { "type": "text", - "id": "IgKDOIQhfqb_x9gQh30eh" + "id": "MNgTOO1UYazXucNSjXZ_z" } ], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false }, { "type": "text", - "version": 89, - "versionNonce": 421732236, - "index": "b66", + "version": 409, + "versionNonce": 1162021528, + "index": "b6A", "isDeleted": false, - "id": "IgKDOIQhfqb_x9gQh30eh", + "id": "MNgTOO1UYazXucNSjXZ_z", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 787.3900070190429, - "y": 330.3636363636364, + "x": 863.6645521684126, + "y": 344.97159090909105, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 15.219985961914062, + "width": 12.579986572265625, "height": 25, - "seed": 1893385699, + "seed": 1223112963, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, "fontFamily": 5, - "text": "B", + "text": "C", "textAlign": "center", "verticalAlign": "middle", - "containerId": "IkaeA2i4mlTdmulYEI_na", - "originalText": "B", + "containerId": "m2Wj9fp76PKCAhrulCmTa", + "originalText": "C", "autoResize": true, "lineHeight": 1.25 }, { - "type": "rectangle", - "version": 440, - "versionNonce": 1439264564, - "index": "b67", + "type": "text", + "version": 188, + "versionNonce": 1924528024, + "index": "b6AG", "isDeleted": false, - "id": "qGfihx9_lQSyc1F8oQTu0", + "id": "J1KVE_C00rdGo7FWIwu1X", "fillStyle": "solid", - "strokeWidth": 1, + "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 772.909090909091, - "y": 369.01136363636374, + "x": 817.7954545454544, + "y": 162.01136363636374, "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - 
"height": 35, - "seed": 1381062179, + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 1442121325, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "0DIl-np94wHje4sIubFJp" - } - ], - "updated": 1726708776347, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, "link": null, - "locked": false + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1", + "autoResize": true, + "lineHeight": 1.25 }, { "type": "text", - "version": 133, - "versionNonce": 1496272396, - "index": "b68", + "version": 242, + "versionNonce": 759383192, + "index": "b6AV", "isDeleted": false, - "id": "0DIl-np94wHje4sIubFJp", + "id": "TIEDsM4QhNNDJARAJnvDz", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 790.2554612593218, - "y": 374.01136363636374, - "strokeColor": "#1e1e1e", + "x": 820.7954545454544, + "y": 208.26136363636374, + "strokeColor": "#e03131", "backgroundColor": "transparent", - "width": 12.579986572265625, + "width": 11, "height": 25, - "seed": 1722325443, + "seed": 846611715, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, - "fontFamily": 5, - "text": "C", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "qGfihx9_lQSyc1F8oQTu0", - "originalText": "C", - "autoResize": true, + "fontFamily": 8, + "text": "2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "2", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 290, + "versionNonce": 580841880, + "index": "b6Al", + "isDeleted": 
false, + "id": "tGvqUuD_kCzfMYn-UX8o-", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 823.2954545454544, + "y": 257.01136363636374, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 758667053, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "3", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "3", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 421, + "versionNonce": 704446104, + "index": "b6B", + "isDeleted": false, + "id": "IQM8OVr381UGBDKQtda8U", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 823.0454545454544, + "y": 345.26136363636374, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 618433805, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "5", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 672, + "versionNonce": 336685976, + "index": "b6BV", + "isDeleted": false, + "id": "fJGd6Pf-SaTmbDMUGHhUW", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 847.3972327492455, + "y": 296.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1491526540, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + 
"frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "Ax-8fSsrXvrkMhlGAgJgO" + } + ], + "updated": 1737528708101, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 363, + "versionNonce": 2064660632, + "index": "b6C", + "isDeleted": false, + "id": "Ax-8fSsrXvrkMhlGAgJgO", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 863.423603404652, + "y": 301.2812500000001, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 1943704076, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "fJGd6Pf-SaTmbDMUGHhUW", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 320, + "versionNonce": 313353624, + "index": "b6CV", + "isDeleted": false, + "id": "07qZABiLS71UbigBsFpnK", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 821.033596385609, + "y": 301.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 1965424820, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "4", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "4", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "arrow", + "version": 2745, + "versionNonce": 1420536808, + "index": "b6D", + "isDeleted": false, + "id": "M_WCuesgPRdSQ_zqaUtz0", + "fillStyle": 
"solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 905.532130562785, + "y": 274.97561555378826, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "width": 162.00146582282412, + "height": 0.6286347709357187, + "seed": 1489010356, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "startBinding": { + "elementId": "JNHVvikjirDDllCKotbJC", + "focus": 0.4403861575576877, + "gap": 13.304857835512394, + "fixedPoint": null + }, + "endBinding": { + "elementId": "NxUqy-MsYDga_9XDrU9l7", + "focus": -0.04300532190875777, + "gap": 1, + "fixedPoint": null + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 162.00146582282412, + -0.6286347709357187 + ] + ] + }, + { + "type": "text", + "version": 311, + "versionNonce": 212346088, + "index": "b6D8", + "isDeleted": false, + "id": "ZGkHBN9UBrJLYPIlm-KTj", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1062.555487199263, + "y": 410.51136363636374, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 118.5198974609375, + "height": 50, + "seed": 1591407981, + "groupIds": [ + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897882, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "duplicate 'B'\nis removed", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "duplicate 'B'\nis removed", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 285, + "versionNonce": 1763919848, + "index": "b6DG", + "isDeleted": false, + "id": "wkavhEPwz2TNGwf8xFeLA", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + 
"roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1065.0335963856091, + "y": 172.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 809955212, + "groupIds": [ + "uHtPh4-PiLJtgc-p_Cdgo", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897882, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 653, + "versionNonce": 1883376360, + "index": "b6DO", + "isDeleted": false, + "id": "Qaz1byDgzm-0ZrVLBmU4v", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1089.9545454545455, + "y": 257.1875000000001, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 144156909, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "D2HbgzHXdGyxGppwaWbBy" + }, + { + "id": "cXhTkxU13WdQeAv3Z_1mR", + "type": "arrow" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 410, + "versionNonce": 1998221544, + "index": "b6DV", + "isDeleted": false, + "id": "D2HbgzHXdGyxGppwaWbBy", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1104.6509142788973, + "y": 262.1875000000001, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 17.879989624023438, + "height": 25, + "seed": 2062418765, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", 
+ "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A'", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Qaz1byDgzm-0ZrVLBmU4v", + "originalText": "A'", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 683, + "versionNonce": 1735136232, + "index": "b6Dd", + "isDeleted": false, + "id": "-LxVJeZLqj0MgI5FEg_pm", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1082.5, + "y": 163.55113636363643, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1514803629, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "trFDjiJr6cfNlCSEKqNjE" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 425, + "versionNonce": 1133598440, + "index": "b6Dl", + "isDeleted": false, + "id": "trFDjiJr6cfNlCSEKqNjE", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1099.3763691295276, + "y": 168.55113636363643, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 13.519989013671875, + "height": 25, + "seed": 1674925069, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A", + "textAlign": "center", + "verticalAlign": "middle", + 
"containerId": "-LxVJeZLqj0MgI5FEg_pm", + "originalText": "A", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 680, + "versionNonce": 269892072, + "index": "b6E", + "isDeleted": false, + "id": "Kxu9owye4gMpRvh7kJ1Nl", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1088.590909090909, + "y": 210.73295454545456, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1938377325, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "UP92rSYiIXnnBFhov6WNx" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 370, + "versionNonce": 1611054312, + "index": "b6EG", + "isDeleted": false, + "id": "UP92rSYiIXnnBFhov6WNx", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1104.6172797463157, + "y": 215.73295454545456, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 707753165, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Kxu9owye4gMpRvh7kJ1Nl", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 707, + "versionNonce": 82763752, + "index": "b6EV", + "isDeleted": false, + "id": "KMOsOR4pOx-ute2ztnw1k", + "fillStyle": "solid", + 
"strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1094.318181818182, + "y": 345.4715909090911, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 635317229, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "SsRO-f6mzQzf5jQOudz6C" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 401, + "versionNonce": 1054515944, + "index": "b6El", + "isDeleted": false, + "id": "SsRO-f6mzQzf5jQOudz6C", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1111.6645521684127, + "y": 350.4715909090911, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 12.579986572265625, + "height": 25, + "seed": 1382819405, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "C", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "KMOsOR4pOx-ute2ztnw1k", + "originalText": "C", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 319, + "versionNonce": 1817576936, + "index": "b6F", + "isDeleted": false, + "id": "US1PK13ekocRlMvOrHSJL", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1066.0335963856091, + "y": 215.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 1525760780, + "groupIds": [ + 
"bQ__H1TgpJXskAm32UBLZ", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "2", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 357, + "versionNonce": 980224232, + "index": "b6FV", + "isDeleted": false, + "id": "NxUqy-MsYDga_9XDrU9l7", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1068.5335963856091, + "y": 261.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 1116920372, + "groupIds": [ + "4mN8vM1PMjtKHfzWdqXES", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [ + { + "id": "M_WCuesgPRdSQ_zqaUtz0", + "type": "arrow" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "3", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "3", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 353, + "versionNonce": 354283240, + "index": "b6G", + "isDeleted": false, + "id": "lSEPKkiY8if2M9pDun8DS", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1071.5335963856091, + "y": 354.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 932194828, + "groupIds": [ + "Z8bVLPerSCYHViV4Ld1Ed", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + 
"updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "5", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 145, + "versionNonce": 56362904, + "index": "b6Q", + "isDeleted": false, + "id": "9Bwc8DwyPnrOxUQpApvfU", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 257.30863987315786, + "y": 383.5312500000001, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 103.71990966796875, + "height": 50, + "seed": 1385699816, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528426042, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "PDF \ndocuments", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "PDF \ndocuments", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 494, + "versionNonce": 1068503272, + "index": "b6R", + "isDeleted": false, + "id": "QSiEFZIoz081ipwdmU8sg", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 251.34862552989614, + "y": 242.95738636363643, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 1529123224, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "_Z-rRn1k6dRs-cBIHwwQY" + }, + { + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + } + ], + "updated": 1737528651437, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 265, + "versionNonce": 1790196968, + "index": "b6S", + "isDeleted": false, + "id": "_Z-rRn1k6dRs-cBIHwwQY", + "fillStyle": "solid", + 
"strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 265.2249946594238, + "y": 247.95738636363643, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 19.519989013671875, + "height": 25, + "seed": 13541016, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528539700, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A'", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "QSiEFZIoz081ipwdmU8sg", + "originalText": "A'", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 505, + "versionNonce": 48835560, + "index": "b6T", + "isDeleted": false, + "id": "3xE7duRO9Qq4Sc-G2OvNv", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 246.89408007535064, + "y": 148.3210227272728, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 1605307288, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "Vb3hONt1wd7JHFzI3HmrQ" + } + ], + "updated": 1737528540117, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 262, + "versionNonce": 1551754904, + "index": "b6U", + "isDeleted": false, + "id": "Vb3hONt1wd7JHFzI3HmrQ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 263.03044371171427, + "y": 153.3210227272728, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 15, + "height": 25, + "seed": 1106892952, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528540117, + "link": null, + "locked": false, + "fontSize": 20, + 
"fontFamily": 5, + "text": "A", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "3xE7duRO9Qq4Sc-G2OvNv", + "originalText": "A", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 525, + "versionNonce": 225964696, + "index": "b6V", + "isDeleted": false, + "id": "ooV7vvmtMmdPRnQmMHBmf", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 249.98498916625965, + "y": 196.50284090909093, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 191038872, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "_rMbVkq-GLuJSkRWHvjkn" + } + ], + "updated": 1737528539700, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 227, + "versionNonce": 472392424, + "index": "b6W", + "isDeleted": false, + "id": "_rMbVkq-GLuJSkRWHvjkn", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 265.1213528026233, + "y": 201.50284090909093, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 17, + "height": 25, + "seed": 152998552, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528539700, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "ooV7vvmtMmdPRnQmMHBmf", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 510, + "versionNonce": 768826600, + "index": "b6X", + "isDeleted": false, + "id": "JUjlPmSPagKyAA6ikwVcf", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 
254.16680734807767, + "y": 287.59375000000006, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 1105231768, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "Tov62fM0_erGxbIhudlqt" + }, + { + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + } + ], + "updated": 1737528566266, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 214, + "versionNonce": 1140033000, + "index": "b6Y", + "isDeleted": false, + "id": "Tov62fM0_erGxbIhudlqt", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 269.3031709844413, + "y": 292.59375000000006, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 17, + "height": 25, + "seed": 1172098200, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528539700, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "JUjlPmSPagKyAA6ikwVcf", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 578, + "versionNonce": 1264463000, + "index": "b6Z", + "isDeleted": false, + "id": "4cU98zwq8Qi78OlWyES2s", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 255.71226189353263, + "y": 331.2414772727274, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 2127002008, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "hDWulD4JcLixt2n_PIyWF" + } + ], + "updated": 1737528539700, + "link": null, + "locked": 
false + }, + { + "type": "text", + "version": 284, + "versionNonce": 1113229544, + "index": "b6a", + "isDeleted": false, + "id": "hDWulD4JcLixt2n_PIyWF", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 272.34862552989625, + "y": 336.2414772727274, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 14, + "height": 25, + "seed": 2144634520, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528539700, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "C", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "4cU98zwq8Qi78OlWyES2s", + "originalText": "C", + "autoResize": true, "lineHeight": 1.25 }, { - "type": "text", - "version": 70, - "versionNonce": 247294132, - "index": "b69", + "type": "image", + "version": 295, + "versionNonce": 1682243816, + "index": "b6d", "isDeleted": false, - "id": "lkM4ke2d8E4KSisX5yE08", + "id": "XH-Rt0Q5-K2g4tM9reh76", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 762.5454545454546, - "y": 429.51136363636374, - "strokeColor": "#1e1e1e", - "backgroundColor": "#d0bfff", - "width": 64.55995178222656, - "height": 25, - "seed": 1905848653, + "x": 510.8409090909091, + "y": 143.88636363636368, + "strokeColor": "transparent", + "backgroundColor": "transparent", + "width": 60.17910447761194, + "height": 60.17910447761194, + "seed": 1159948140, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "KGVjVuaPc35r3zwmLpo6p" ], "frameId": null, "roundness": null, - "boundElements": [], - "updated": 1726708776347, + "boundElements": [ + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" + } + ], + "updated": 1737528662022, "link": null, "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "chunks", - "textAlign": "left", - "verticalAlign": "top", - "containerId": 
null, - "originalText": "chunks", - "autoResize": true, - "lineHeight": 1.25 + "status": "saved", + "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "scale": [ + 1, + 1 + ], + "crop": null }, { - "type": "rectangle", - "version": 527, - "versionNonce": 1269467404, - "index": "b698", + "type": "image", + "version": 344, + "versionNonce": 276052968, + "index": "b6e", "isDeleted": false, - "id": "JNHVvikjirDDllCKotbJC", + "id": "YFlD_rDw6IwCctPG9BjYf", "fillStyle": "solid", - "strokeWidth": 1, + "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1025.9545454545455, - "y": 275.68750000000006, - "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - "height": 35, - "seed": 848769955, + "x": 510.8409090909091, + "y": 209.70725915875175, + "strokeColor": "transparent", + "backgroundColor": "transparent", + "width": 60.17910447761194, + "height": 60.17910447761194, + "seed": 1369151980, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "KGVjVuaPc35r3zwmLpo6p" ], "frameId": null, - "roundness": { - "type": 3 - }, + "roundness": null, "boundElements": [ { - "type": "text", - "id": "8Msc7tXcZdg2UUH2NmUn-" + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + }, + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" } ], - "updated": 1726708934863, + "updated": 1737528663639, "link": null, - "locked": false + "locked": false, + "status": "saved", + "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "scale": [ + 1, + 1 + ], + "crop": null }, { - "type": "text", - "version": 287, - "versionNonce": 1779271564, - "index": "b69G", + "type": "image", + "version": 375, + "versionNonce": 1533627624, + "index": "b6f", "isDeleted": false, - "id": "8Msc7tXcZdg2UUH2NmUn-", + "id": "7R-AwuwB2mlKHQ4TA3v7g", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1040.6509142788973, - "y": 280.68750000000006, - "strokeColor": "#1e1e1e", + "x": 
507.5390491822035, + "y": 280.3521455223882, + "strokeColor": "transparent", "backgroundColor": "transparent", - "width": 17.879989624023438, - "height": 25, - "seed": 1297532739, + "width": 60.17910447761194, + "height": 60.17910447761194, + "seed": 1189477272, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "KGVjVuaPc35r3zwmLpo6p" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737528662023, "link": null, "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "A'", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "JNHVvikjirDDllCKotbJC", - "originalText": "A'", - "autoResize": true, - "lineHeight": 1.25 + "status": "saved", + "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "scale": [ + 1, + 1 + ], + "crop": null }, { "type": "rectangle", - "version": 565, - "versionNonce": 1888269836, - "index": "b69O", + "version": 804, + "versionNonce": 602477288, + "index": "b6g", "isDeleted": false, - "id": "fkbHGW5tJ-Ay0sh8h-9hJ", + "id": "e4ecV_y0ryxDQzzpC-xuB", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1022.5, - "y": 182.05113636363643, + "x": 1480.6454339460893, + "y": 499.97869318181824, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 2116216547, + "seed": 1087979672, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": { @@ -1821,40 +2558,40 @@ "boundElements": [ { "type": "text", - "id": "BNiP4zX7PtFTn_e_5vXX3" + "id": "uQnFGHOdIKBjcans1vzUh" } ], - "updated": 1726708934863, + "updated": 1737530585213, "link": null, "locked": false }, { "type": "text", - "version": 308, - "versionNonce": 1814172812, - "index": "b69V", + "version": 548, + "versionNonce": 957607832, + "index": "b6h", "isDeleted": false, - "id": "BNiP4zX7PtFTn_e_5vXX3", + "id": "uQnFGHOdIKBjcans1vzUh", "fillStyle": "solid", "strokeWidth": 2, 
"strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1039.3763691295276, - "y": 187.05113636363643, + "x": 1496.7817975824528, + "y": 504.97869318181824, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 13.519989013671875, + "width": 15, "height": 25, - "seed": 1804210819, + "seed": 1242918296, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585213, "link": null, "locked": false, "fontSize": 20, @@ -1862,33 +2599,33 @@ "text": "A", "textAlign": "center", "verticalAlign": "middle", - "containerId": "fkbHGW5tJ-Ay0sh8h-9hJ", + "containerId": "e4ecV_y0ryxDQzzpC-xuB", "originalText": "A", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 558, - "versionNonce": 981967628, - "index": "b69d", + "version": 797, + "versionNonce": 102135272, + "index": "b6i", "isDeleted": false, - "id": "QYKbNgibs7-HxaNNr8tfG", + "id": "_NOEhFqnCLHtq6yXXa5Ft", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1024.590909090909, - "y": 229.23295454545456, + "x": 1482.7363430369983, + "y": 547.1605113636365, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1716177443, + "seed": 356776600, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": { @@ -1897,40 +2634,40 @@ "boundElements": [ { "type": "text", - "id": "C-rwFmAbwI_qgVqpkXy7m" + "id": "J3LCjL2uxV-fjOQWF1Nyl" } ], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false }, { "type": "text", - "version": 249, - "versionNonce": 1916232076, - "index": "b69l", + "version": 489, + "versionNonce": 1696742552, + "index": "b6j", "isDeleted": false, - "id": "C-rwFmAbwI_qgVqpkXy7m", + "id": "J3LCjL2uxV-fjOQWF1Nyl", "fillStyle": "solid", "strokeWidth": 2, 
"strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1040.6172797463155, - "y": 234.23295454545456, + "x": 1497.8727066733618, + "y": 552.1605113636365, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 15.219985961914062, + "width": 17, "height": 25, - "seed": 592678339, + "seed": 1964566424, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -1938,33 +2675,33 @@ "text": "B", "textAlign": "center", "verticalAlign": "middle", - "containerId": "QYKbNgibs7-HxaNNr8tfG", + "containerId": "_NOEhFqnCLHtq6yXXa5Ft", "originalText": "B", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 653, - "versionNonce": 1248546828, - "index": "b69t", + "version": 910, + "versionNonce": 580876520, + "index": "b6k", "isDeleted": false, - "id": "m2Wj9fp76PKCAhrulCmTa", - "fillStyle": "solid", + "id": "JQQ2WM4JRpHcVDQ6tWh9E", + "fillStyle": "cross-hatch", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1027.318181818182, - "y": 365.97159090909105, + "x": 1488.4636157642713, + "y": 601.899147727273, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 901963107, + "seed": 1170748568, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": { @@ -1973,40 +2710,40 @@ "boundElements": [ { "type": "text", - "id": "MNgTOO1UYazXucNSjXZ_z" + "id": "-t96Vcbd_pHmWnfG-tPFY" } ], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false }, { "type": "text", - "version": 348, - "versionNonce": 52260492, - "index": "b6A", + "version": 602, + "versionNonce": 1943988632, + "index": "b6l", "isDeleted": false, - "id": "MNgTOO1UYazXucNSjXZ_z", + "id": "-t96Vcbd_pHmWnfG-tPFY", "fillStyle": 
"solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1044.6645521684127, - "y": 370.97159090909105, + "x": 1505.0999794006348, + "y": 606.899147727273, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 12.579986572265625, + "width": 14, "height": 25, - "seed": 1223112963, + "seed": 1023795608, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2014,38 +2751,38 @@ "text": "C", "textAlign": "center", "verticalAlign": "middle", - "containerId": "m2Wj9fp76PKCAhrulCmTa", + "containerId": "JQQ2WM4JRpHcVDQ6tWh9E", "originalText": "C", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 127, - "versionNonce": 1292352780, - "index": "b6AG", + "version": 365, + "versionNonce": 1829772264, + "index": "b6m", "isDeleted": false, - "id": "J1KVE_C00rdGo7FWIwu1X", + "id": "VdLIGckmm2zBfC3i4wvrn", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 998.7954545454545, - "y": 188.01136363636374, + "x": 1455.9408884915438, + "y": 505.9389204545456, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 12, "height": 25, - "seed": 1442121325, + "seed": 973467288, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2060,31 +2797,36 @@ }, { "type": "text", - "version": 181, - "versionNonce": 832846732, - "index": "b6AV", + "version": 424, + "versionNonce": 1974063512, + "index": "b6n", "isDeleted": false, - "id": "TIEDsM4QhNNDJARAJnvDz", + "id": "KCk9Ks3UrLoOid_qWtcKt", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, 
"angle": 0, - "x": 1001.7954545454545, - "y": 234.26136363636374, + "x": 1459.9408884915438, + "y": 552.1889204545457, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 11, "height": 25, - "seed": 846611715, + "seed": 360471448, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, - "boundElements": [], - "updated": 1726708934863, + "boundElements": [ + { + "id": "uJzNGI-VzOHyMa0kMCtyo", + "type": "arrow" + } + ], + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2099,382 +2841,289 @@ }, { "type": "text", - "version": 229, - "versionNonce": 2066541068, - "index": "b6Al", - "isDeleted": false, - "id": "tGvqUuD_kCzfMYn-UX8o-", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1004.2954545454545, - "y": 283.01136363636374, - "strokeColor": "#e03131", - "backgroundColor": "transparent", - "width": 12, - "height": 25, - "seed": 758667053, - "groupIds": [ - "ssihZCwGeFNCQehvjAg06" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726708934863, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 8, - "text": "3", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "3", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "text", - "version": 360, - "versionNonce": 479971468, - "index": "b6B", + "version": 611, + "versionNonce": 125066984, + "index": "b6o", "isDeleted": false, - "id": "IQM8OVr381UGBDKQtda8U", + "id": "uc2hgh9lXoidExmskulnJ", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1004.0454545454545, - "y": 371.26136363636374, + "x": 1461.1908884915438, + "y": 612.1889204545457, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 11, "height": 25, - "seed": 618433805, + "seed": 1906124952, "groupIds": [ - 
"ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, "fontFamily": 8, "text": "5", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "5", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 611, - "versionNonce": 430626572, - "index": "b6BV", - "isDeleted": false, - "id": "fJGd6Pf-SaTmbDMUGHhUW", - "fillStyle": "solid", - "strokeWidth": 1, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1028.3972327492456, - "y": 322.2812500000001, - "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - "height": 35, - "seed": 1491526540, - "groupIds": [ - "ssihZCwGeFNCQehvjAg06" - ], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "Ax-8fSsrXvrkMhlGAgJgO" - } - ], - "updated": 1726708934863, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 302, - "versionNonce": 1859392908, - "index": "b6C", - "isDeleted": false, - "id": "Ax-8fSsrXvrkMhlGAgJgO", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1044.423603404652, - "y": 327.2812500000001, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 15.219985961914062, - "height": 25, - "seed": 1943704076, - "groupIds": [ - "ssihZCwGeFNCQehvjAg06" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726708934863, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "B", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "fJGd6Pf-SaTmbDMUGHhUW", - "originalText": "B", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", "autoResize": true, 
"lineHeight": 1.25 }, { "type": "text", - "version": 259, - "versionNonce": 2035385356, - "index": "b6CV", + "version": 552, + "versionNonce": 531850136, + "index": "b6p", "isDeleted": false, - "id": "07qZABiLS71UbigBsFpnK", + "id": "vbXyYItXCJiZ95GHEna2G", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1002.0335963856091, - "y": 327.2812500000001, - "strokeColor": "#e03131", + "x": 1432.8286670338025, + "y": 661.083806818182, + "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 11, + "width": 197.33984375, "height": 25, - "seed": 1965424820, + "seed": 169629080, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, - "fontFamily": 8, - "text": "4", + "fontFamily": 5, + "text": "C is marked as spam", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "4", + "originalText": "C is marked as spam", "autoResize": true, "lineHeight": 1.25 }, { + "id": "-CNAjEmW6cbufb2V3aXbb", "type": "arrow", - "version": 2600, - "versionNonce": 1259679372, - "index": "b6D", - "isDeleted": false, - "id": "M_WCuesgPRdSQ_zqaUtz0", + "x": 1388.4659090909088, + "y": 250.5312500000001, + "width": 113.16269233010075, + "height": 228, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", "fillStyle": "solid", - "strokeWidth": 1, + "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, - "angle": 0, - "x": 1113.5321305627851, - "y": 279.97561555378826, - "strokeColor": "#2f9e44", - "backgroundColor": "transparent", - "width": 154.2895204048931, - "height": 2.3372664247598323, - "seed": 1489010356, "groupIds": [], "frameId": null, - "roundness": { - "type": 2 - }, + "index": "b6q", + "roundness": null, + "seed": 1354092264, + "version": 165, + "versionNonce": 
464680344, + "isDeleted": false, "boundElements": [], - "updated": 1726708895234, + "updated": 1737530583905, "link": null, "locked": false, - "startBinding": null, - "endBinding": null, - "lastCommittedPoint": null, - "startArrowhead": null, - "endArrowhead": "arrow", "points": [ [ 0, 0 ], [ - 154.2895204048931, - 2.3372664247598323 + 113.16269233010075, + 0 + ], + [ + 113.16269233010075, + 228 ] - ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "NzWqph0M7tEkeTDKLPGZR", + "focus": 0.4253246753246783, + "gap": 5.000000000000114, + "fixedPoint": [ + 1.1057692307692308, + 0.7126623376623391 + ] + }, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true }, { "type": "text", - "version": 176, - "versionNonce": 14571020, - "index": "b6E", + "version": 1099, + "versionNonce": 1108693656, + "index": "b6s", "isDeleted": false, - "id": "wkavhEPwz2TNGwf8xFeLA", + "id": "ocrQNX8WLBEF3z4H5qV1Q", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", - "roughness": 1, + "roughness": 0, "opacity": 100, "angle": 0, - "x": 1263.0335963856091, - "y": 188.2812500000001, - "strokeColor": "#e03131", - "backgroundColor": "transparent", - "width": 12, - "height": 25, - "seed": 809955212, - "groupIds": [ - "uHtPh4-PiLJtgc-p_Cdgo" - ], + "x": 1506.5825046192517, + "y": 291.4184149825713, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 135.80796813964844, + "height": 58.225670034857664, + "seed": 1216046568, + "groupIds": [], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708942969, + "updated": 1737529134305, "link": null, "locked": false, - "fontSize": 20, - "fontFamily": 8, - "text": "1", + "fontSize": 23.290268013943066, + "fontFamily": 1, + "text": "5. document\nquality", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "1", + "originalText": "5. 
document\nquality", "autoResize": true, "lineHeight": 1.25 }, { - "type": "rectangle", - "version": 538, - "versionNonce": 1071049484, - "index": "b6F", + "type": "arrow", + "version": 1524, + "versionNonce": 2138633960, + "index": "b6t", "isDeleted": false, - "id": "Qaz1byDgzm-0ZrVLBmU4v", + "id": "uJzNGI-VzOHyMa0kMCtyo", "fillStyle": "solid", - "strokeWidth": 1, + "strokeWidth": 2, "strokeStyle": "solid", - "roughness": 1, + "roughness": 0, "opacity": 100, "angle": 0, - "x": 1288.9545454545455, - "y": 273.1875000000001, - "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - "height": 35, - "seed": 144156909, - "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" - ], + "x": 1450.701621813599, + "y": 572.658384798537, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", + "width": 231.1460407851796, + "height": 1.29512872695625, + "seed": 772325608, + "groupIds": [], "frameId": null, "roundness": { - "type": 3 + "type": 2 }, - "boundElements": [ - { - "type": "text", - "id": "D2HbgzHXdGyxGppwaWbBy" - } - ], - "updated": 1726708966705, + "boundElements": [], + "updated": 1737530585216, "link": null, - "locked": false + "locked": false, + "startBinding": { + "elementId": "KCk9Ks3UrLoOid_qWtcKt", + "focus": -0.6425776620043193, + "gap": 9.23926667794467, + "fixedPoint": null + }, + "endBinding": { + "elementId": "TL7ufCnIHYiHVmKWJljll", + "focus": 0.14400907570834828, + "gap": 5.546510718694094, + "fixedPoint": null + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -231.1460407851796, + -1.29512872695625 + ] + ] }, { "type": "text", - "version": 296, - "versionNonce": 2108300212, - "index": "b6G", + "version": 1200, + "versionNonce": 800272536, + "index": "b6u", "isDeleted": false, - "id": "D2HbgzHXdGyxGppwaWbBy", + "id": "AWSDUNN6IaU5NZQ1ScgSU", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", - "roughness": 1, + 
"roughness": 0, "opacity": 100, "angle": 0, - "x": 1303.6509142788973, - "y": 278.1875000000001, + "x": 1276.7246173511853, + "y": 540.4184149825712, "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 17.879989624023438, - "height": 25, - "seed": 2062418765, - "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" - ], + "backgroundColor": "#b2f2bb", + "width": 124.44776916503906, + "height": 58.225670034857664, + "seed": 1343739368, + "groupIds": [], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "A'", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "Qaz1byDgzm-0ZrVLBmU4v", - "originalText": "A'", + "fontSize": 23.290268013943066, + "fontFamily": 1, + "text": "6. removing\nspam ..etc", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "6. removing\nspam ..etc", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 569, - "versionNonce": 509454732, - "index": "b6H", + "version": 896, + "versionNonce": 1019725032, + "index": "b6v", "isDeleted": false, - "id": "-LxVJeZLqj0MgI5FEg_pm", + "id": "Rdnl5GxK4pFbFoTLI-oOG", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1281.5, - "y": 179.55113636363643, + "x": 1164.6454339460893, + "y": 503.97869318181824, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1514803629, + "seed": 1661634456, "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": { @@ -2483,41 +3132,40 @@ "boundElements": [ { "type": "text", - "id": "trFDjiJr6cfNlCSEKqNjE" + "id": "gfBsltp4ourNC3Fnk9ClO" } ], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false }, { "type": 
"text", - "version": 311, - "versionNonce": 1054115124, - "index": "b6I", + "version": 640, + "versionNonce": 674323864, + "index": "b6w", "isDeleted": false, - "id": "trFDjiJr6cfNlCSEKqNjE", + "id": "gfBsltp4ourNC3Fnk9ClO", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1298.3763691295276, - "y": 184.55113636363643, + "x": 1180.7817975824528, + "y": 508.97869318181824, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 13.519989013671875, + "width": 15, "height": 25, - "seed": 1674925069, + "seed": 1149621400, "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2525,34 +3173,33 @@ "text": "A", "textAlign": "center", "verticalAlign": "middle", - "containerId": "-LxVJeZLqj0MgI5FEg_pm", + "containerId": "Rdnl5GxK4pFbFoTLI-oOG", "originalText": "A", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 566, - "versionNonce": 713594892, - "index": "b6J", + "version": 892, + "versionNonce": 1875358696, + "index": "b6x", "isDeleted": false, - "id": "Kxu9owye4gMpRvh7kJ1Nl", + "id": "TL7ufCnIHYiHVmKWJljll", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1287.590909090909, - "y": 226.73295454545456, + "x": 1166.7363430369983, + "y": 551.1605113636365, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1938377325, + "seed": 1393525144, "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": { @@ -2561,41 +3208,44 @@ "boundElements": [ { "type": "text", - "id": "UP92rSYiIXnnBFhov6WNx" + "id": "Qs_O62O1HCrusz6mXeH8i" + }, + { + "id": "uJzNGI-VzOHyMa0kMCtyo", + "type": 
"arrow" } ], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false }, { "type": "text", - "version": 256, - "versionNonce": 301317812, - "index": "b6K", + "version": 581, + "versionNonce": 711060120, + "index": "b6y", "isDeleted": false, - "id": "UP92rSYiIXnnBFhov6WNx", + "id": "Qs_O62O1HCrusz6mXeH8i", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1303.6172797463157, - "y": 231.73295454545456, + "x": 1181.8727066733618, + "y": 556.1605113636365, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 15.219985961914062, + "width": 17, "height": 25, - "seed": 707753165, + "seed": 500928152, "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2603,206 +3253,125 @@ "text": "B", "textAlign": "center", "verticalAlign": "middle", - "containerId": "Kxu9owye4gMpRvh7kJ1Nl", + "containerId": "TL7ufCnIHYiHVmKWJljll", "originalText": "B", "autoResize": true, "lineHeight": 1.25 }, - { - "type": "rectangle", - "version": 593, - "versionNonce": 5355148, - "index": "b6L", - "isDeleted": false, - "id": "KMOsOR4pOx-ute2ztnw1k", - "fillStyle": "solid", - "strokeWidth": 1, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1293.318181818182, - "y": 361.4715909090911, - "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - "height": 35, - "seed": 635317229, - "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" - ], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "SsRO-f6mzQzf5jQOudz6C" - } - ], - "updated": 1726708966705, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 287, - "versionNonce": 800311348, - 
"index": "b6M", - "isDeleted": false, - "id": "SsRO-f6mzQzf5jQOudz6C", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1310.6645521684127, - "y": 366.4715909090911, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 12.579986572265625, - "height": 25, - "seed": 1382819405, - "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726708966705, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "C", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "KMOsOR4pOx-ute2ztnw1k", - "originalText": "C", - "autoResize": true, - "lineHeight": 1.25 - }, { "type": "text", - "version": 206, - "versionNonce": 745735436, - "index": "b6N", + "version": 457, + "versionNonce": 351906536, + "index": "b71", "isDeleted": false, - "id": "US1PK13ekocRlMvOrHSJL", + "id": "h9eneFYpYcKGCUroEQPXT", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1265.0335963856091, - "y": 231.2812500000001, + "x": 1139.9408884915438, + "y": 509.9389204545456, "strokeColor": "#e03131", "backgroundColor": "transparent", - "width": 11, + "width": 12, "height": 25, - "seed": 1525760780, + "seed": 2119562648, "groupIds": [ - "bQ__H1TgpJXskAm32UBLZ", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, "fontFamily": 8, - "text": "2", + "text": "1", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "2", + "originalText": "1", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 241, - "versionNonce": 1274323380, - "index": "b6O", + "version": 514, + "versionNonce": 284743576, + "index": "b72", 
"isDeleted": false, - "id": "NxUqy-MsYDga_9XDrU9l7", + "id": "2FH_CC-PbldTPMTV0l3zg", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1267.5335963856091, - "y": 277.2812500000001, + "x": 1143.9408884915438, + "y": 556.1889204545457, "strokeColor": "#e03131", "backgroundColor": "transparent", - "width": 12, + "width": 11, "height": 25, - "seed": 1116920372, + "seed": 3375768, "groupIds": [ - "4mN8vM1PMjtKHfzWdqXES", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, "fontFamily": 8, - "text": "3", + "text": "2", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "3", + "originalText": "2", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 240, - "versionNonce": 342262668, - "index": "b6P", + "version": 639, + "versionNonce": 961809896, + "index": "b74", "isDeleted": false, - "id": "lSEPKkiY8if2M9pDun8DS", + "id": "tn954yHWPQx-IDIpEMxaF", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1270.5335963856091, - "y": 370.2812500000001, - "strokeColor": "#e03131", + "x": 1116.8286670338025, + "y": 665.083806818182, + "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 11, + "width": 135.03990173339844, "height": 25, - "seed": 932194828, + "seed": 1349893272, "groupIds": [ - "Z8bVLPerSCYHViV4Ld1Ed", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, - "fontFamily": 8, - "text": "5", + "fontFamily": 5, + "text": "Spam removed", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "5", + "originalText": "Spam removed", 
"autoResize": true, "lineHeight": 1.25 } diff --git a/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png new file mode 100644 index 000000000..f40893ac1 Binary files /dev/null and b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png differ diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb new file mode 100644 index 000000000..e6b4cb951 --- /dev/null +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb @@ -0,0 +1,2938 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": { + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" + }, + "source": [ + "# Processing PDFs using Data Prep Kit\n", + "\n", + "This notebook will introduce DPK and showcase some of its capabilities.\n", + "\n", + "Here is the workflow:\n", + "\n", + "- pdf2parquet: Extract text from PDF documents\n", + "- docid: compute hashes\n", + "- exact dedupe : filter out identical documents\n", + "- fuzzy dedupe : filter out 'near duplicates'\n", + "- document quality: scoring documents for quality\n", + "\n", + "![](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b15976e3", + "metadata": { + "id": "b15976e3" + }, + "source": [ + "## How to run this notebook\n", + "\n", + "Two options:\n", + "\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb)\n", + "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", + "\n", + "The notebook will work as-is in both environments." + ] + }, + { + "cell_type": "markdown", + "id": "39a0ab6e", + "metadata": { + "id": "39a0ab6e" + }, + "source": [ + "## Step-1: Figure out Runtime Environment\n", + "\n", + "### 1.1 - Determine runtime\n", + "\n", + "Determine whether we are running on Google Colab or in a local Python environment" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1fe354b7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1fe354b7", + "outputId": "39cc4e90-b230-4100-92c9-3aa3d977fa3d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "a5dc2b68", + "metadata": { + "id": "a5dc2b68" + }, + "source": [ + "### 1.2 - Install dependencies if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1fcec577", + "metadata": { + "id": "1fcec577" + }, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "if RUNNING_IN_COLAB:\n", + " ! 
pip install --default-timeout=100 \\\n", + " data-prep-toolkit-transforms[all]==1.0.0 \\\n", + " humanfriendly" + ] + }, + { + "cell_type": "markdown", + "id": "243322b8", + "metadata": { + "id": "243322b8" + }, + "source": [ + "### 1.3 - Restart Runtime\n", + "\n", + "After installing dependencies, be sure to restart the runtime so the libraries will be loaded\n", + "\n", + "You do this by going to **`Runtime --> Restart Session`**\n", + "\n", + "Then you can continue to the next step (no need to re-run the notebook)" + ] + }, + { + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": { + "id": "e8b10be1" + }, + "source": [ + "## Step-2: Configuration & Utils" + ] + }, + { + "cell_type": "markdown", + "id": "356c66f7", + "metadata": { + "id": "356c66f7" + }, + "source": [ + "### 2.1 - Basic Config" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e4YMZrBuFycl", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e4YMZrBuFycl", + "outputId": "ad7fc57a-5229-4841-8d8a-23272aa5197d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": { + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" + }, + "source": [ + "### 2.2 - Setup input/output directories" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "outputId": "63d1d197-dfb1-4d6f-eb88-846bbbff1446" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Cleared output directory\n" 
+ ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "if RUNNING_IN_COLAB:\n", + " input_dir = \"input\"\n", + " shutil.os.makedirs(input_dir, exist_ok=True)\n", + "else:\n", + " input_dir = \"../../data-files/pdf-processing-1/\"\n", + " \n", + "output_dir = \"output\"\n", + "\n", + "output_pdf2pq_dir = os.path.join (output_dir, '01_pdf2pq_out')\n", + "output_docid_dir = os.path.join (output_dir, '02_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (output_dir, '03_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (output_dir, '04_fuzzy_dedupe_out')\n", + "output_doc_quality_dir = os.path.join (output_dir, '05_doc_quality_out')\n", + "output_final_dir = os.path.join (output_dir, 'output_final')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(output_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_dir, exist_ok=True)\n", + "print (\"✅ Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "14b2f34c", + "metadata": { + "id": "14b2f34c" + }, + "source": [ + "### 2.3 - Handy Utils" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ba47a370", + "metadata": { + "id": "ba47a370" + }, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from humanfriendly import format_size\n", + "import pandas as pd\n", + "import glob\n", + "\n", + "## Reads parquet files in a folder into a pandas dataframe\n", + "def read_parquet_files_as_df (parquet_dir):\n", + " parquet_files = glob.glob(f'{parquet_dir}/*.parquet')\n", + " # read each parquet file into a DataFrame and store in a list\n", + " dfs = [pd.read_parquet (f) for f in parquet_files]\n", + " dfs = [df for df in dfs if not df.empty] # filter out empty dataframes\n", + " # Concatenate all DataFrames into a single DataFrame\n", + " if len(dfs) > 0:\n", + " data_df = pd.concat(dfs, ignore_index=True)\n", + " return data_df\n", + " else:\n", + " return pd.DataFrame() # return empty df\n", + "# 
------------\n", + "\n", + "\n", + "def download_file(url, local_file, chunk_size=1024*1024):\n", + " \"\"\"\n", + " Downloads a remote URL to a local file.\n", + "\n", + " Args:\n", + " url (str): The remote URL.\n", + " local_file (str): The name of the local file to save the downloaded content.\n", + " chunk_size (int): The size in bytes of each chunk. Defaults to 1024*1024 (1 MB).\n", + "\n", + " Returns:\n", + " None\n", + "\n", + " Example usage:\n", + " download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB\n", + " \"\"\"\n", + " # Check if the local file already exists\n", + " if os.path.exists(local_file):\n", + " file_size = format_size(os.path.getsize(local_file))\n", + " print(f\"Local file '{local_file}' ({file_size}) already exists. Skipping download.\")\n", + " return\n", + "\n", + " # Create the directory if it doesn't exist\n", + " os.makedirs(os.path.dirname(local_file), exist_ok=True)\n", + "\n", + " # Stream the file download\n", + " with requests.get(url, stream=True) as r:\n", + " r.raise_for_status()\n", + " with open(local_file, 'wb') as f:\n", + " for chunk in r.iter_content(chunk_size=chunk_size):\n", + " if chunk: # filter out keep-alive new chunks\n", + " f.write(chunk)\n", + " print()\n", + " file_size = format_size(os.path.getsize(local_file))\n", + " print(f\"{local_file} ({file_size}) downloaded successfully.\")\n", + "## --- end: download_file ------\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "dc1972c3", + "metadata": { + "id": "dc1972c3" + }, + "source": [ + "## Step-3: Inspect the Data\n", + "\n", + "We will use simple PDFs. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/data-files/pdf-processing-1/)\n", + "\n", + "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf)\n", + "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf) nearly identical to earth.pdf (ONE word difference!)\n", + "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf)\n", + "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf) - contains spammy contents\n", + "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" + ] + }, + { + "cell_type": "markdown", + "id": "7113b16c", + "metadata": { + "id": "7113b16c" + }, + "source": [ + "### 3.1 - Download Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "23db1064", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "23db1064", + "outputId": "d871231d-86e2-4db7-a437-1510047bef2a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using input files from : ../../data-files/pdf-processing-1/\n" + ] + } + ], + "source": [ + "if RUNNING_IN_COLAB:\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf', 
os.path.join(input_dir, 'earth-copy.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", + "else:\n", + " print ('Using input files from : ', input_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + "metadata": { + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" + }, + "source": [ + "## Step-4: Extract Data from PDF (pdf2parquet)\n", + "\n", + "This step we will read PDF files and extract the text data.\n", + "\n", + "[Pdf2Parquet documentation](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/README.md)\n", + "\n", + "We use the [Docling package](https://github.com/DS4SD/docling).\n" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": { + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" + }, + "source": [ + "### 4.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 836, + "referenced_widgets": [ + "df5c199339f5467a91453fa187e201f0", + "257dbf0b62624667b0c82afaf1c8ccf1", + "4e76bef9228546fd97cccfe7bdd856f3", + "c0c37c0262b84e9ebf02c1ce17f263ee", + "ca821137125b45d08e257f95822a6f72", + 
"fb81f32569c34250b901235698e5ea18", + "1ce164863aa34f64a94aeb5d05103043", + "e2b5f84c30de45d29588a07a3d106eb4", + "cc7d3125eb55461180566d1064eeb2a5", + "68eb811a52804887bc383e89a72a0975", + "55b9873ce1f34c169ecc6087c3cd65a1" + ] + }, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "outputId": "da48c24e-c32c-4fc9-e6aa-37b1921c3d4d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='../../data-files/pdf-processing-1/' --> output='output/01_pdf2pq_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:06:13 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "23:06:13 INFO - pipeline id pipeline_id\n", + "23:06:13 INFO - code location None\n", + "23:06:13 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", + "23:06:13 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "23:06:13 INFO - orchestrator pdf2parquet started at 2025-02-04 23:06:13\n", + "23:06:13 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "23:06:13 INFO - Initializing models\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "21a1c68550c848cba79340080a1ccde4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 9 files: 0%| | 0/9 [00:00 output='{output_pdf2pq_dir}'\\n\", flush=True)\n", + "\n", + "result = Pdf2Parquet(input_folder= input_dir,\n", + " output_folder= 
output_pdf2pq_dir,\n", + " data_files_to_use=['.pdf'],\n", + " pdf2parquet_contents_type=pdf2parquet_contents_types.MARKDOWN, # markdown\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "5ca790e0", + "metadata": { + "id": "5ca790e0" + }, + "source": [ + "### 4.2 - Inspect Generated output\n", + "\n", + "Here we should see one entry per input file processed." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fe59563d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 557 + }, + "id": "fe59563d", + "outputId": "81b70c9f-cc39-4f78-f29f-f81d4fcf19ae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/01_pdf2pq_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filename
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum10252b1cdf4-b1ef-4375-8e6b-23f174592c066571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-02-04T23:06:20.4705440.693593lorem-ipsum.pdf
1spam.pdfFree xxx102854dca5d-9db5-4ea5-b2e5-bddd176bf1b810026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-02-04T23:06:21.8198930.676735spam.pdf
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011875d0907-8dd3-4ef9-b3b0-a0083e7ad43810729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-02-04T23:06:19.7749150.641045earth2.pdf
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10116264e62a-0121-4cd4-8202-ea6e228e15f17758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-02-04T23:06:21.1412300.668992mars.pdf
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011582bc53b-96e2-4b09-8dd7-6a27a685a53e14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:06:18.1998031.053618earth-copy.pdf
5earth.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011c6c18475-9365-4325-85dc-8acf6b969d8f14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:06:19.1320900.929218earth.pdf
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "5 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "5 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", + "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "5 c6c18475-9365-4325-85dc-8acf6b969d8f 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", + "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", + "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", + "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", + "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", + "5 2025-02-04T23:06:19.132090 0.929218 earth.pdf " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print (\"Displaying contents of : \", output_pdf2pq_dir)\n", + "output_df = read_parquet_files_as_df(output_pdf2pq_dir)\n", + "# print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "output_df.head(10)\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "e5058a21", + "metadata": { + "id": "e5058a21" + }, + "source": [ + "\n", + "### 4.3 - Understand the output\n", + "\n", + "Here are some interesting attributes to note:\n", + "\n", + "- **filename** : original filename\n", + "- **contents** : text\n", + "- **document_id**: unique id (UUID) assigned to this document\n", + "- **document_hash**: hash of the document\n", + "- **hash** : hash of `contents` column\n", + "- **pdf_convert_time** : time to convert this pdf in seconds\n", + "\n", + "**Note: you should notice the hash values are identical for the duplicate documents**\n", + "\n", + "Let's inspect the **contents** column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f870e624", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f870e624", + "outputId": "8064d9df-c226-4795-b9ad-34d50709a8c3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Earth\n", + "\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, ﬁve dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "\n", + "For more details about our Solar system see Chapter 1.\n", + "\n", + "## Earth\n", + "\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "\n", + "Basic facts about Earth:\n", + "\n", + "- · Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "- · Moons: One moon, called Luna or simply \"the Moon\".\n", + "- · Rotation Period: 24 hours (one day)\n" + ] + } + ], + "source": [ + "print (output_df[output_df['filename'] == 'earth.pdf'].iloc[0,]['contents'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e1a10c2d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e1a10c2d", + "outputId": "3dbf4e39-1c4c-443e-968c-32aae9010165" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Free xxx\n" + ] + } + ], + "source": [ + "print (output_df[output_df['filename'] == 'spam.pdf'].iloc[0,]['contents'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b37dd994", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lorem ipsum Lorem ipsum Lorem ipsum\n" + ] + } + ], + "source": [ + "print (output_df[output_df['filename'] == 'lorem-ipsum.pdf'].iloc[0,]['contents'])" + ] + }, + { + "cell_type": "markdown", + "id": "7fc86d5b", + "metadata": { + 
"id": "7fc86d5b" + }, + "source": [ + "## Step-5: Create DOC ID for Documents\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "\n", + "**This step is a pre-requisite for fuzzy dedup** in the pipeline.\n", + "\n", + "[DocID documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/doc_id)" + ] + }, + { + "cell_type": "markdown", + "id": "f516a253", + "metadata": { + "id": "f516a253" + }, + "source": [ + "### 5.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "cee20521", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cee20521", + "outputId": "dd568017-e39c-4524-cdcf-6c97a1341ab9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_pdf2pq_out' --> output='output/02_docid_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:06:22 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "23:06:22 
INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator doc_id started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:2 completed successfully\n", + "CPU times: user 27.6 ms, sys: 2.32 ms, total: 29.9 ms\n", + "Wall time: 23.4 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_doc_id.transform_python import DocID\n", + "\n", + "STAGE = 2\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{output_pdf2pq_dir}' --> output='{output_docid_dir}'\\n\", flush=True)\n", + "\n", + "result = DocID(input_folder= output_pdf2pq_dir,\n", + " output_folder= output_docid_dir,\n", + " doc_id_doc_column= \"contents\",\n", + " doc_id_hash_column= \"doc_hash\",\n", + " # doc_id_int_column= \"doc_id_int\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " #doc_id_start_id= 5\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"โŒ Stage:{STAGE} failed\")\n" 
+ ] + }, + { + "cell_type": "markdown", + "id": "4bd6f382", + "metadata": { + "id": "4bd6f382" + }, + "source": [ + "### 5.2 - Inspect Generated output\n", + "\n", + "You will see two new columns: **doc_hash** and **int_id_column**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f3d4aba9", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 557 + }, + "id": "f3d4aba9", + "outputId": "b4b868b3-ebc7-48a2-f0c5-b0b023a24238" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/02_docid_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_column
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum10252b1cdf4-b1ef-4375-8e6b-23f174592c066571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-02-04T23:06:20.4705440.693593lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3
1spam.pdfFree xxx102854dca5d-9db5-4ea5-b2e5-bddd176bf1b810026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-02-04T23:06:21.8198930.676735spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011875d0907-8dd3-4ef9-b3b0-a0083e7ad43810729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-02-04T23:06:19.7749150.641045earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10116264e62a-0121-4cd4-8202-ea6e228e15f17758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-02-04T23:06:21.1412300.668992mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011582bc53b-96e2-4b09-8dd7-6a27a685a53e14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:06:18.1998031.053618earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0
5earth.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011c6c18475-9365-4325-85dc-8acf6b969d8f14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:06:19.1320900.929218earth.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...1
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "5 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "5 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", + "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "5 c6c18475-9365-4325-85dc-8acf6b969d8f 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", + "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", + "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", + "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", + "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", + "5 2025-02-04T23:06:19.132090 0.929218 earth.pdf \n", + "\n", + " doc_hash int_id_column \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 1 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print (\"Displaying contents of : \", output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "c55f8d3f", + "metadata": { + "id": "c55f8d3f" + }, + "source": [ + "## Step-6: Eliminate Duplicate Documents\n", + "\n", + "We have 2 exact duplicates: **earth.pdf** , **earth-copy.pdf**\n", + "\n", + "Note how **doc_hash** for these documents are the same.\n", + "\n", + "[Exact dedupe information](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/ededup)" + ] + }, + { + "cell_type": "markdown", + "id": "6f5ef1f7", + "metadata": { + "id": "6f5ef1f7" + }, + "source": [ + "### 6.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "90eddb4c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "90eddb4c", + "outputId": "61221177-f23e-4daa-8e34-237582fc19b0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-3: Processing input='output/02_docid_out' --> output='output/03_exact_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:06:22 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - 
output/03_exact_dedupe_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator ededup started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "23:06:22 INFO - Starting from the beginning\n", + "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:3 completed successfully\n", + "CPU times: user 37.3 ms, sys: 3.56 ms, total: 40.9 ms\n", + "Wall time: 36.4 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_ededup.transform_python import Ededup\n", + "\n", + "STAGE = 3\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{output_docid_dir}' --> output='{output_exact_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Ededup(input_folder=output_docid_dir,\n", + " output_folder=output_exact_dedupe_dir,\n", + " ededup_doc_column=\"contents\",\n", + " ededup_doc_id_column=\"doc_hash\"\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"โŒ Stage:{STAGE} failed\")" + ] + }, + { + 
"cell_type": "markdown", + "id": "f4aacf09", + "metadata": { + "id": "f4aacf09" + }, + "source": [ + "### 6.2 - Inspect Generated output\n", + "\n", + "You can see one of **earth.pdf** or **earth-copy.pdf** will be eliminated." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1887b26d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 611 + }, + "id": "1887b26d", + "outputId": "31210411-1abd-418a-c1d9-167770788d62" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 6\n", + "Output files after exact dedupe : 5\n", + "Duplicate files removed : 1\n", + "Displaying contents of : output/03_exact_dedupe_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum10252b1cdf4-b1ef-4375-8e6b-23f174592c066571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-02-04T23:06:20.4705440.693593lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
1spam.pdfFree xxx102854dca5d-9db5-4ea5-b2e5-bddd176bf1b810026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-02-04T23:06:21.8198930.676735spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011875d0907-8dd3-4ef9-b3b0-a0083e7ad43810729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-02-04T23:06:19.7749150.641045earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2[]
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10116264e62a-0121-4cd4-8202-ea6e228e15f17758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-02-04T23:06:21.1412300.668992mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011582bc53b-96e2-4b09-8dd7-6a27a685a53e14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:06:18.1998031.053618earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0[]
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", + "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", + "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", + "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", + "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", + "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 [] " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_exact_dedupe_dir)\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "76ea34e2", + "metadata": { + "id": "76ea34e2" + }, + "source": [ + "## Step-7: Fuzzy Dedupe\n", + "\n", + "In the previous step, we removed **exact duplicates (identical documents)**.\n", + "\n", + "Fuzzy de-dupe can further filter out documents that are **not exactly identical, but nearly identical**.\n", + "\n", + "Here is a simple example:\n", + "\n", + "`Our solar system is a vast and fascinating expanse`\n", + "\n", + "`The solar system is a vast and fascinating expanse`\n", + "\n", + "Only one word is different: `Our` vs `The`.\n", + "\n", + "Imagine two documents that differ only by one extra blank line. For our purposes they are the same.\n", + "\n", + "[Fuzzy dedupe documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/fdedup)\n", + "\n", + "### Tweaking fuzzy matches\n", + "\n", + "**`jaccard_similarity_threshold`** is the parameter used to tweak similarities between documents. Its value is between 0 and 1.0. Values close to 1.0 mean stricter checking (fewer documents will qualify).
A lower threshold means more lenient matching (more documents will qualify).\n", + "\n", + "Adjust this value to find what works for your documents." + ] + }, + { + "cell_type": "markdown", + "id": "79a37713", + "metadata": { + "id": "79a37713" + }, + "source": [ + "### 7.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "37430b60", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "37430b60", + "outputId": "48366a20-f5c2-4040-bf56-8b29ce40ed53" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-4: Processing input='output/03_exact_dedupe_out' --> output='output/04_fuzzy_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:06:22 INFO - Starting SignatureCalculation step\n", + "23:06:22 INFO - Got parameters for SignatureCalculation\n", + "23:06:22 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "23:06:22 INFO - data factory scdata_ is using local configuration without input/output path\n", + "23:06:22 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use
['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator minhash started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:06:22 WARNING - table is empty, skipping processing\n", + "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 6 files (100.0%) in 0.001 min\n", + "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:22 INFO - Starting flush()\n", + "23:06:22 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", + "23:06:22 INFO - done flushing in 0.028 sec\n", + "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "23:06:22 INFO - SignatureCalculation completed successfully\n", + "23:06:22 INFO - Starting ClusterAnalysis step\n", + "23:06:22 INFO - Got parameters for ClusterAnalysis\n", + "23:06:22 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator cluster started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of folders 
is 14\n", + "23:06:22 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "23:06:22 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (21.43%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "23:06:22 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "23:06:22 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 8 files (57.14%) in 0.0 min\n", + "23:06:22 INFO - Completed 9 files (64.29%) in 0.001 min\n", + "23:06:22 INFO - Completed 10 files (71.43%) in 0.001 min\n", + "23:06:22 INFO - Completed 11 files (78.57%) in 0.001 min\n", + "23:06:22 INFO - Completed 12 files (85.71%) in 0.001 min\n", + "23:06:22 INFO - Completed 13 files (92.86%) in 0.001 min\n", + "23:06:22 INFO - Completed 14 files (100.0%) in 0.001 min\n", + "23:06:22 INFO - Done processing 14 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "23:06:22 INFO - ClusterAnalysis completed successfully\n", + "23:06:22 INFO - Starting GetDuplicateList step\n", + "23:06:22 INFO - Got parameters for GetDuplicateList\n", + "23:06:22 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator fdlist started at 
2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of folders is 1\n", + "23:06:22 INFO - Get Duplicate List for folder docs_to_remove\n", + "23:06:22 INFO - 1 documents marked as duplicates\n", + "23:06:22 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "23:06:22 INFO - Done processing 1 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n", + "23:06:22 INFO - GetDuplicateList completed successfully\n", + "23:06:22 INFO - Starting DataCleaning step\n", + "23:06:22 INFO - Got parameters for DataCleaning\n", + "23:06:22 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "23:06:22 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "23:06:22 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator fdclean started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 
min\n", + "23:06:22 WARNING - table is empty, skipping processing\n", + "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (83.33%) in 0.001 min\n", + "23:06:22 INFO - Completed 6 files (100.0%) in 0.001 min\n", + "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "23:06:22 INFO - DataCleaning completed successfully\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 288 ms, sys: 114 ms, total: 402 ms\n", + "Wall time: 262 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_fdedup.transform_python import Fdedup\n", + "\n", + "STAGE = 4\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{output_exact_dedupe_dir}' --> output='{output_fuzzy_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Fdedup(input_folder=output_exact_dedupe_dir,\n", + " output_folder=output_fuzzy_dedupe_dir,\n", + " contents_column= \"contents\",\n", + " # document_id_column= \"doc_id\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " jaccard_similarity_threshold = 0.8, # between 0 - 1. 
higher means more strict checking\n", + " operation_mode=\"filter_duplicates\",\n", + " # operation_mode=\"annotate\",\n", + " ).transform()\n", + "# if result == 0:\n", + "# print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "# else:\n", + "# raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" + ] + }, + { + "cell_type": "markdown", + "id": "b2c83592", + "metadata": { + "id": "b2c83592" + }, + "source": [ + "### 7.2 - Inspect Output\n", + "\n", + "FuzzyDedupe writes the documents that survive filtering to the **output/04_fuzzy_dedupe_out/cleaned** folder.\n", + "\n", + "You will notice that only one of the Earth documents (**earth-copy.pdf**) made it; the near-duplicate **earth2.pdf** was filtered out." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "573faba2", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 511 + }, + "id": "573faba2", + "outputId": "49408c6e-a22b-404f-ccc5-c00edb7ce85a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 5\n", + "Output files after exact dedupe : 4\n", + "Near duplicate files removed : 1\n", + "Displaying contents of : output/04_fuzzy_dedupe_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum10252b1cdf4-b1ef-4375-8e6b-23f174592c066571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-02-04T23:06:20.4705440.693593lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
1spam.pdfFree xxx102854dca5d-9db5-4ea5-b2e5-bddd176bf1b810026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-02-04T23:06:21.8198930.676735spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10116264e62a-0121-4cd4-8202-ea6e228e15f17758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-02-04T23:06:21.1412300.668992mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011582bc53b-96e2-4b09-8dd7-6a27a685a53e14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:06:18.1998031.053618earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0[]
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", + "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", + "2 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", + "3 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 [] " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "output_df = read_parquet_files_as_df(os.path.join(output_fuzzy_dedupe_dir, \"cleaned\"))\n", + "\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Near duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_fuzzy_dedupe_dir)\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "3e0598a0", + "metadata": { + "id": "3e0598a0" + }, + "source": [ + "## Step-8: Document Quality\n", + "\n", + "This handy plugin will score documents across many metrics.\n", + "\n", + "Here we will look for 'bad words' metric.\n", + "\n", + "[Document quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)\n", + "\n", + "By default it uses [bad words collection](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality/dpk_doc_quality/ldnoobw). 
You can supply a custom file by passing an argument `bad_word_filepath=/path/to/badwords_file`" + ] + }, + { + "cell_type": "markdown", + "id": "1949c2c4", + "metadata": { + "id": "1949c2c4" + }, + "source": [ + "### 8.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b485f598", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b485f598", + "outputId": "448a8ee1-9371-4bd4-f5ad-a596893fe65f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-5: Processing input='output/04_fuzzy_dedupe_out/cleaned' --> output='output/05_doc_quality_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:06:22 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "23:06:22 INFO - data factory docq_ is using local configuration without input/output path\n", + "23:06:22 INFO - data factory docq_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator docq started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 5, source 
profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", + "23:06:22 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "23:06:22 INFO - Completed 1 files (20.0%) in 0.0 min\n", + "23:06:22 WARNING - table is empty, skipping processing\n", + "23:06:22 INFO - Completed 2 files (40.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (60.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (80.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (100.0%) in 0.0 min\n", + "23:06:22 INFO - Done processing 5 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:5 completed successfully\n", + "CPU times: user 41.8 ms, sys: 1.98 ms, total: 43.7 ms\n", + "Wall time: 36.3 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_doc_quality.transform_python import DocQuality\n", + "\n", + "STAGE = 5\n", + "output_fuzzy_dedupe_cleaned_dir = os.path.join(output_fuzzy_dedupe_dir, \"cleaned\")\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{output_fuzzy_dedupe_cleaned_dir}' --> output='{output_doc_quality_dir}'\\n\", flush=True)\n", + "\n", + "result = DocQuality(input_folder=output_fuzzy_dedupe_cleaned_dir,\n", + " output_folder= output_doc_quality_dir,\n", + " docq_text_lang = \"en\",\n", + " docq_doc_content_column =\"contents\",\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"โŒ Stage:{STAGE} failed (result={result})\")" + ] + }, + { + "cell_type": "markdown", + "id": "eccefd3e", + "metadata": { + "id": "eccefd3e" + }, + "source": [ + "### 8.2 - 
Inspect the Output\n", + "\n", + "We will see several new columns starting with the name **docq_**.\n", + "\n", + "Look at the column **docq_contain_bad_word**; this will flag documents with 'bad words'.\n", + "\n", + "Also inspect the column **docq_lorem_ipsum_ratio**; this will flag documents with 'lorem ipsum' text\n", + "\n", + "For more information see : [Doc Quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1f3225f8", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 485 + }, + "id": "1f3225f8", + "outputId": "a6009dc0-6ca6-411a-8066-090c610860e0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/05_doc_quality_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum10252b1cdf4-b1ef-4375-8e6b-23f174592c066571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...35...5.0000000.00000010.0857140.0False0.0000000.01.000000False
1spam.pdfFree xxx102854dca5d-9db5-4ea5-b2e5-bddd176bf1b810026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...8...3.5000000.00000010.0000000.0True0.0000000.01.000000False
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10116264e62a-0121-4cd4-8202-ea6e228e15f17758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.0000000.0False0.1764710.00.880000True
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011582bc53b-96e2-4b09-8dd7-6a27a685a53e14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.0000000.0False0.1764710.00.880734True
\n", + "

4 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "\n", + " hash size ... \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 ... \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... 
\n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "0 5.000000 0.000000 1 \n", + "1 3.500000 0.000000 1 \n", + "2 4.688000 0.032000 8 \n", + "3 4.541284 0.027523 9 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "0 0.085714 0.0 False \n", + "1 0.000000 0.0 True \n", + "2 0.000000 0.0 False \n", + "3 0.000000 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "0 0.000000 0.0 \n", + "1 0.000000 0.0 \n", + "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "0 1.000000 False \n", + "1 1.000000 False \n", + "2 0.880000 True \n", + "3 0.880734 True \n", + "\n", + "[4 rows x 27 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "print (\"Displaying contents of : \", output_doc_quality_dir)\n", + "output_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "02fa3bd2", + "metadata": { + "id": "02fa3bd2" + }, + "source": [ + "### 8.3 - Filtering 'quality' documents\n", + "\n", + "So from the output above we see **spam.pdf** is flagged for containing bad words (**docq_contain_bad_word=True**).\n", + "\n", + "Also **lorem-ipsum.pdf** is flagged for placeholder content **lorem ipsum** (**docq_lorem_ipsum_ratio > 0**).\n", + "\n", + "We are going to filter them both out." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5dac1c70", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "5dac1c70", + "outputId": "463e897f-1099-410a-f753-34c4846228c3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10116264e62a-0121-4cd4-8202-ea6e228e15f17758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.00.0False0.1764710.00.880000True
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011582bc53b-96e2-4b09-8dd7-6a27a685a53e14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.00.0False0.1764710.00.880734True
\n", + "

2 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "\n", + " hash size ... \\\n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "2 4.688000 0.032000 8 \n", + "3 4.541284 0.027523 9 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "2 0.0 0.0 False \n", + "3 0.0 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "2 0.880000 True \n", + "3 0.880734 True \n", + "\n", + "[2 rows x 27 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_docs_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "\n", + "# remove documents with badwords\n", + "clean_docs_df = all_docs_df[all_docs_df['docq_contain_bad_word'] == False]\n", + "\n", + "# also filter out 'lorem ipsum' text\n", + "clean_docs_df = clean_docs_df[clean_docs_df['docq_lorem_ipsum_ratio'] == 0]\n", + "\n", + "clean_docs_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": { + "id": "f5e12630-be6b-4188-a925-77117155617b" + }, + "source": [ + "## Step-9: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": 
"16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "metadata": { + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207" + }, + "outputs": [], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(output_final_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_final_dir, exist_ok=True)\n", + "\n", + "output_final_dir_parquet = os.path.join (output_final_dir, 'pq')\n", + "shutil.os.makedirs(output_final_dir_parquet, exist_ok=True)\n", + "\n", + "output_final_dir_markdown = os.path.join (output_final_dir, 'markdown')\n", + "shutil.os.makedirs(output_final_dir_markdown, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "e06ce4f2", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e06ce4f2", + "outputId": "8a26e407-2cc8-44ee-ba6b-ca6485a92926" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Saved CLEAN parquet output to 'output/output_final/pq'\n" + ] + } + ], + "source": [ + "## save parquet\n", + "\n", + "clean_docs_df.to_parquet(os.path.join(output_final_dir_parquet, \"clean_docs.parquet\"))\n", + "print (f\"โœ… Saved CLEAN parquet output to '{output_final_dir_parquet}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "1e175302", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1e175302", + "outputId": "d54c5d80-23ce-49a6-e098-8e712d048975" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Saved CLEAN markdown output to 'output/output_final/markdown'\n" + ] + } + ], + "source": [ + "## save markdown text\n", + "\n", + "for index, row in clean_docs_df.iterrows():\n", + " output_file_name = os.path.join (output_final_dir_markdown, row['filename'] + '.md')\n", + " with open(output_file_name, 'w') as output_file:\n", + " output_file.write(row['contents'])\n", + "\n", + "print (f\"โœ… Saved CLEAN markdown output to '{output_final_dir_markdown}'\")\n" + ] + } + ], + 
"metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "dpk-6-pdf-processing-r1.0.0-all-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1ce164863aa34f64a94aeb5d05103043": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "257dbf0b62624667b0c82afaf1c8ccf1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fb81f32569c34250b901235698e5ea18", + "placeholder": "โ€‹", + "style": "IPY_MODEL_1ce164863aa34f64a94aeb5d05103043", + "value": "Fetchingโ€‡9โ€‡files:โ€‡100%" + } + }, + "4e76bef9228546fd97cccfe7bdd856f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e2b5f84c30de45d29588a07a3d106eb4", + "max": 9, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cc7d3125eb55461180566d1064eeb2a5", + "value": 9 + } + }, + "55b9873ce1f34c169ecc6087c3cd65a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "68eb811a52804887bc383e89a72a0975": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + 
"right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c0c37c0262b84e9ebf02c1ce17f263ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_68eb811a52804887bc383e89a72a0975", + "placeholder": "โ€‹", + "style": "IPY_MODEL_55b9873ce1f34c169ecc6087c3cd65a1", + "value": "โ€‡9/9โ€‡[00:00<00:00,โ€‡220.49it/s]" + } + }, + "ca821137125b45d08e257f95822a6f72": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } 
+ }, + "cc7d3125eb55461180566d1064eeb2a5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "df5c199339f5467a91453fa187e201f0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_257dbf0b62624667b0c82afaf1c8ccf1", + "IPY_MODEL_4e76bef9228546fd97cccfe7bdd856f3", + "IPY_MODEL_c0c37c0262b84e9ebf02c1ce17f263ee" + ], + "layout": "IPY_MODEL_ca821137125b45d08e257f95822a6f72" + } + }, + "e2b5f84c30de45d29588a07a3d106eb4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + 
"height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fb81f32569c34250b901235698e5ea18": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb new file mode 100644 index 000000000..04ed0fad4 --- /dev/null +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb @@ -0,0 
+1,2904 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": { + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" + }, + "source": [ + "# Processing PDFs using Data Prep Kit (Ray version)\n", + "\n", + "This notebook will introduce DPK and showcase some of its capabilities.\n", + "\n", + "Here is the workflow:\n", + "\n", + "- pdf2parquet: Extract text from PDF documents\n", + "- docid: compute hashes\n", + "- exact dedupe : filter out identical documents\n", + "- fuzzy dedupe : filter out 'near duplicates'\n", + "- document quality: scoring documents for quality\n", + "\n", + "![](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b15976e3", + "metadata": { + "id": "b15976e3" + }, + "source": [ + "## How to run this notebook\n", + "\n", + "Two options:\n", + "\n", + "- **Option 1 - Google Colab:** Easiest option; no setup required. Click this link to open this notebook on Google Colab. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb)\n", + "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", + "\n", + "The notebook will work in both environments." + ] + }, + { + "cell_type": "markdown", + "id": "25ef1be4", + "metadata": {}, + "source": [ + "## Step-1: Figure out Runtime Environment\n", + "\n", + "### 1.1 - Determine runtime\n", + "\n", + "Determine if we are running on Google Colab or a local Python environment." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "13c97768", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "df9594f1", + "metadata": {}, + "source": [ + "### 1.2 - Install dependencies if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dc538bc3", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "if RUNNING_IN_COLAB:\n", + " ! 
pip install --default-timeout=100 \\\n", + " data-prep-toolkit-transforms[ray,all]==1.0.0 \\\n", + " humanfriendly" + ] + }, + { + "cell_type": "markdown", + "id": "a34c5175", + "metadata": {}, + "source": [ + "### 1.3 - Restart Runtime\n", + "\n", + "After installing dependencies, be sure to restart the runtime so the libraries will be loaded.\n", + "\n", + "You do this by going to **`Runtime --> Restart Session`**\n", + "\n", + "Then you can continue to the next step (no need to re-run the notebook)" + ] + }, + { + "cell_type": "markdown", + "id": "113ed1a3", + "metadata": {}, + "source": [ + "## Step-2: Configuration & Utils" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d4f57ff5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "970e692b", + "metadata": {}, + "source": [ + "### 2.2 - Set up input/output directories" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "74ed9531", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Cleared output directory\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "if RUNNING_IN_COLAB:\n", + " input_dir = \"input\"\n", + " shutil.os.makedirs(input_dir, exist_ok=True)\n", + "else:\n", + " input_dir = \"../../data-files/pdf-processing-1/\"\n", + "\n", + "output_dir = \"output\"\n", + "\n", + "output_pdf2pq_dir = os.path.join (output_dir, '01_pdf2pq_out')\n", + "output_docid_dir = os.path.join (output_dir, '02_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (output_dir, '03_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (output_dir, 
'04_fuzzy_dedupe_out')\n", + "output_doc_quality_dir = os.path.join (output_dir, '05_doc_quality_out')\n", + "output_final_dir = os.path.join (output_dir, 'output_final')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(output_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_dir, exist_ok=True)\n", + "print (\"โœ… Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "3a3bf77f", + "metadata": {}, + "source": [ + "### 2.3 - Runtime Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "991f58d9", + "metadata": {}, + "outputs": [], + "source": [ + "from data_processing.utils import GB\n", + "\n", + "CONFIG_RAY_NUM_CPUS = 1 # CPUs per worker\n", + "CONFIG_RAY_MEMORY = 2 * GB # memory per worker\n", + "CONFIG_RAY_RUNTIME_WORKERS = 2" + ] + }, + { + "cell_type": "markdown", + "id": "f40af9e1", + "metadata": {}, + "source": [ + "### 2.4 - Handy Utils" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "df47deb1", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from humanfriendly import format_size\n", + "import pandas as pd\n", + "import glob\n", + "\n", + "## Reads parquet files in a folder into a pandas dataframe\n", + "def read_parquet_files_as_df (parquet_dir):\n", + " parquet_files = glob.glob(f'{parquet_dir}/*.parquet')\n", + " # read each parquet file into a DataFrame and store in a list\n", + " dfs = [pd.read_parquet (f) for f in parquet_files]\n", + " dfs = [df for df in dfs if not df.empty] # filter out empty dataframes\n", + " # Concatenate all DataFrames into a single DataFrame\n", + " if len(dfs) > 0:\n", + " data_df = pd.concat(dfs, ignore_index=True)\n", + " return data_df\n", + " else:\n", + " return pd.DataFrame() # return empty df\n", + "# ------------\n", + "\n", + "\n", + "def download_file(url, local_file, chunk_size=1024*1024):\n", + " \"\"\"\n", + " Downloads a remote URL to a local file.\n", + "\n", + " Args:\n", + " 
url (str): The remote URL.\n", + " local_file (str): The path of the local file to save the downloaded content.\n", + " chunk_size (int): The size in bytes of each chunk. Defaults to 1024*1024 (1 MB).\n", + "\n", + " Returns:\n", + " None\n", + "\n", + " Example usage:\n", + " download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB\n", + " \"\"\"\n", + " # Check if the local file already exists\n", + " if os.path.exists(local_file):\n", + " file_size = format_size(os.path.getsize(local_file))\n", + " print(f\"Local file '{local_file}' ({file_size}) already exists. Skipping download.\")\n", + " return\n", + "\n", + " # Create the directory if it doesn't exist\n", + " os.makedirs(os.path.dirname(local_file), exist_ok=True)\n", + "\n", + " # Stream the file download\n", + " with requests.get(url, stream=True) as r:\n", + " r.raise_for_status()\n", + " with open(local_file, 'wb') as f:\n", + " for chunk in r.iter_content(chunk_size=chunk_size):\n", + " if chunk: # filter out keep-alive new chunks\n", + " f.write(chunk)\n", + " print()\n", + " file_size = format_size(os.path.getsize(local_file))\n", + " print(f\"{local_file} ({file_size}) downloaded successfully.\")\n", + "## --- end: download_file ------\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "f5be5e73", + "metadata": {}, + "source": [ + "## Step-3: Inspect the Data\n", + "\n", + "We will use simple PDFs. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/data-files/pdf-processing-1/)\n", + "\n", + "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf)\n", + "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf) almost identical to earth.pdf (ONE word difference!)\n", + "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf)\n", + "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf) - contains spammy contents\n", + "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" + ] + }, + { + "cell_type": "markdown", + "id": "b20947ae", + "metadata": {}, + "source": [ + "### 3.1 - Download Data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f4cc5e1f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using input files from : ../../data-files/pdf-processing-1/\n" + ] + } + ], + "source": [ + "if RUNNING_IN_COLAB:\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", + "\n", + " download_file 
('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + "\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", + "else:\n", + " print ('Using input files from : ', input_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "06fef91e", + "metadata": {}, + "source": [ + "## Step-4: Extract Data from PDF (pdf2parquet)\n", + "\n", + "This step we will read PDF files and extract the text data.\n", + "\n", + "[Pdf2Parquet documentation](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/README.md)\n", + "\n", + "We use the [Docling package](https://github.com/DS4SD/docling).\n" + ] + }, + { + "cell_type": "markdown", + "id": "b27cc402", + "metadata": {}, + "source": [ + "### 4.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "50f2c6a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-1: Processing input='../../data-files/pdf-processing-1/' --> output='output/01_pdf2pq_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:08:37 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "23:08:37 INFO - pipeline id 
pipeline_id\n", + "23:08:37 INFO - code location None\n", + "23:08:37 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "23:08:37 INFO - actor creation delay 0\n", + "23:08:37 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:08:37 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", + "23:08:37 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:08:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "23:08:37 INFO - Running locally\n", + "2025-02-04 23:08:38,509\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - orchestrator started at 2025-02-04 23:08:42\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.743361664935946, 'object_store': 4.371680831536651}\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=2171540)\u001b[0m 23:08:45 INFO - Initializing models\n", + "Fetching 9 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 9/9 [00:00<00:00, 1688.38it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=2171540)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. 
Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:54 INFO - Completed 1 files in 0.031 min\n", + "\u001b[36m(RayTransformFileProcessor pid=2171541)\u001b[0m 23:08:45 INFO - Initializing models\n", + "Fetching 9 files: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 9/9 [00:00<00:00, 29723.41it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=2171541)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:54 INFO - Completed 2 files in 0.033 min\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 3 files in 0.062 min\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 4 files in 0.064 min\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 4 files (66.667%) in 0.064 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:58 INFO - Completed processing 6 files in 0.09 min\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:58 INFO - done flushing in 0.001 sec\n", + "23:09:08 INFO - Completed execution in 0.518 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:1 completed successfully\n" + ] + } + ], + "source": [ + "from dpk_pdf2parquet.ray.transform import Pdf2Parquet\n", + "from dpk_pdf2parquet.transform import pdf2parquet_contents_types\n", + "\n", + "STAGE = 1\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{input_dir}' --> output='{output_pdf2pq_dir}'\\n\", flush=True)\n", + "\n", + "\n", + "result = Pdf2Parquet(input_folder= input_dir,\n", + " output_folder= output_pdf2pq_dir,\n", + " data_files_to_use=['.pdf'],\n", + " pdf2parquet_contents_type=pdf2parquet_contents_types.MARKDOWN, # markdown\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " num_cpus= CONFIG_RAY_NUM_CPUS,\n", + " memory= 
CONFIG_RAY_MEMORY,\n", + " runtime_num_workers = CONFIG_RAY_RUNTIME_WORKERS,\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"โŒ Stage:{STAGE} failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "159a5d67", + "metadata": {}, + "source": [ + "### 4.2 - Inspect Generated output\n", + "\n", + "Here we should see one entry per input file processed." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "82f04cd9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/01_pdf2pq_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filename
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum1023618834f-9dfc-49a1-9066-e2724df95fec6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-02-04T23:08:56.8204441.846058lorem-ipsum.pdf
1spam.pdfFree xxx10227880888-8e1a-4b46-a6a9-fecba8eee0eb10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-02-04T23:08:58.4141201.590731spam.pdf
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101184f59118-2a64-4d4b-991c-10ca09576a7410729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-02-04T23:08:56.7134951.827202earth2.pdf
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101171ef93eb-66d3-4a21-bb47-cf4b85d4f8ff7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-02-04T23:08:58.2724961.547326mars.pdf
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10117ce227b2-66cb-4e2e-b76b-3a0a8c9d2f4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:08:54.8721451.864833earth-copy.pdf
5earth.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10115b12b0e8-946f-4538-8812-9ee74204c2d714711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:08:54.9698281.962273earth.pdf
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "5 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "5 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "5 5b12b0e8-946f-4538-8812-9ee74204c2d7 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", + "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", + "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", + "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", + "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", + "5 2025-02-04T23:08:54.969828 1.962273 earth.pdf " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print (\"Displaying contents of : \", output_pdf2pq_dir)\n", + "output_df = read_parquet_files_as_df(output_pdf2pq_dir)\n", + "# print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "output_df.head(10)\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "56232298", + "metadata": {}, + "source": [ + "\n", + "### 4.3 - Understand the output\n", + "\n", + "Here are some interesting attributes to note:\n", + "\n", + "- **filename** : original filename\n", + "- **contents** : text\n", + "- **document_id**: unique id (UUID) assigned to this document\n", + "- **document_hash**: hash of the document\n", + "- **hash** : hash of `contents` column\n", + "- **pdf_convert_time** : time to convert this pdf in seconds\n", + "\n", + "**Note: you should notice the hash values are identical for the duplicate documents**\n", + "\n", + "Let's inspect the **contents** column." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4bcc03dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Earth\n", + "\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, ﬁve dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", + "\n", + "For more details about our Solar system see Chapter 1.\n", + "\n", + "## Earth\n", + "\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "\n", + "Basic facts about Earth:\n", + "\n", + "- ยท Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "- ยท Moons: One moon, called Luna or simply \"the Moon\".\n", + "- ยท Rotation Period: 24 hours (one day)\n" + ] + } + ], + "source": [ + "print (output_df[output_df['filename'] == 'earth.pdf'].iloc[0,]['contents'])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9d07a30e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Free xxx\n" + ] + } + ], + "source": [ + "print (output_df[output_df['filename'] == 'spam.pdf'].iloc[0,]['contents'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "866857df", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lorem ipsum Lorem ipsum Lorem ipsum\n" + ] + } + ], + "source": [ + "print (output_df[output_df['filename'] == 'lorem-ipsum.pdf'].iloc[0,]['contents'])" + ] + }, + { + "cell_type": "markdown", + "id": "270f1673", + "metadata": {}, + "source": [ + "## Step-5: Create DOC ID for Documents\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. 
To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "\n", + "**This step is a pre-requisite for fuzzy dedup** in the pipeline.\n", + "\n", + "[DocID documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/doc_id)" + ] + }, + { + "cell_type": "markdown", + "id": "32478bb0", + "metadata": {}, + "source": [ + "### 5.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9b0f613b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-2: Processing input='output/01_pdf2pq_out' --> output='output/02_docid_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:09:09 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "23:09:09 INFO - pipeline id pipeline_id\n", + "23:09:09 INFO - code location None\n", + "23:09:09 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "23:09:09 INFO - actor creation delay 0\n", + "23:09:09 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:09:09 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "23:09:09 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:09:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:09 INFO - Running 
locally\n", + "2025-02-04 23:09:10,988\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - orchestrator started at 2025-02-04 23:09:12\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.800101472064853, 'object_store': 4.400050735101104}\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 4 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed processing 6 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - done flushing in 0.001 sec\n", + "23:09:23 INFO - Completed execution in 0.226 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:2 completed successfully\n", + "CPU times: user 122 ms, sys: 132 ms, total: 254 ms\n", + "Wall time: 14.8 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_doc_id.ray.transform import DocID\n", + "\n", + "STAGE = 2\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{output_pdf2pq_dir}' --> output='{output_docid_dir}'\\n\", flush=True)\n", + "\n", + "result = DocID(input_folder= output_pdf2pq_dir,\n", + " output_folder= output_docid_dir,\n", + " doc_id_doc_column= \"contents\",\n", + " doc_id_hash_column= \"doc_hash\",\n", + " # doc_id_int_column= \"doc_id_int\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " num_cpus= CONFIG_RAY_NUM_CPUS,\n", + " memory= CONFIG_RAY_MEMORY,\n", + " runtime_num_workers = CONFIG_RAY_RUNTIME_WORKERS,\n", + " ).transform()\n", + " \n", + "if result == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"โŒ Stage:{STAGE} failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "af2de0e5", + "metadata": {}, + "source": [ + "### 5.2 - Inspect Generated output\n", + "\n", + "You would see a new columns **doc_hash** and **int_id_column**" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "38b6e1cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/02_docid_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_column
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum1023618834f-9dfc-49a1-9066-e2724df95fec6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-02-04T23:08:56.8204441.846058lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3
1spam.pdfFree xxx10227880888-8e1a-4b46-a6a9-fecba8eee0eb10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-02-04T23:08:58.4141201.590731spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101184f59118-2a64-4d4b-991c-10ca09576a7410729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-02-04T23:08:56.7134951.827202earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101171ef93eb-66d3-4a21-bb47-cf4b85d4f8ff7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-02-04T23:08:58.2724961.547326mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10117ce227b2-66cb-4e2e-b76b-3a0a8c9d2f4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:08:54.8721451.864833earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...1
5earth.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10115b12b0e8-946f-4538-8812-9ee74204c2d714711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:08:54.9698281.962273earth.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "5 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "5 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "5 5b12b0e8-946f-4538-8812-9ee74204c2d7 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", + "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", + "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", + "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", + "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", + "5 2025-02-04T23:08:54.969828 1.962273 earth.pdf \n", + "\n", + " doc_hash int_id_column \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 1 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print (\"Displaying contents of : \", output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "141f7cf1", + "metadata": {}, + "source": [ + "## Step-6: Eliminate Duplicate Documents\n", + "\n", + "We have 2 exact duplicates: **earth.pdf** and **earth-copy.pdf**\n", + "\n", + "Note how the **doc_hash** values for these documents are the same.\n", + "\n", + "[Exact dedupe information](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/ededup)" + ] + }, + { + "cell_type": "markdown", + "id": "eb74af84", + "metadata": {}, + "source": [ + "### 6.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "48beaa13", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-3: Processing input='output/02_docid_out' --> output='output/03_exact_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:09:24 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "23:09:24 INFO - pipeline id pipeline_id\n", + "23:09:24 INFO - code location None\n", + "23:09:24 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "23:09:24 INFO - actor creation delay 0\n", + "23:09:24 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job 
id': 'job_id'}\n", + "23:09:24 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "23:09:24 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:09:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:24 INFO - Running locally\n", + "2025-02-04 23:09:25,887\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - orchestrator started at 2025-02-04 23:09:27\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.715737915597856, 'object_store': 4.357868957333267}\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 4 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - done flushing in 0.001 sec\n", + "23:09:38 INFO - Completed execution in 0.226 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:3 completed successfully\n", + "CPU times: user 144 ms, sys: 164 ms, total: 308 ms\n", + "Wall time: 14.8 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_ededup.ray.transform import Ededup\n", + "\n", + "STAGE = 3\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{output_docid_dir}' --> output='{output_exact_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Ededup(input_folder=output_docid_dir,\n", + " output_folder=output_exact_dedupe_dir,\n", + " ededup_doc_column=\"contents\",\n", + " ededup_doc_id_column=\"doc_hash\",\n", + " ededup_num_hashes= 2,\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " num_cpus= CONFIG_RAY_NUM_CPUS,\n", + " memory= CONFIG_RAY_MEMORY,\n", + " runtime_num_workers = CONFIG_RAY_RUNTIME_WORKERS,\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"โŒ Stage:{STAGE} failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "d9d93e16", + "metadata": {}, + "source": [ + "### 6.2 - Inspect Generated output\n", + "\n", + "You can see one of **earth.pdf** or **earth-copy.pdf** will be eliminated." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ef98911d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 6\n", + "Output files after exact dedupe : 5\n", + "Duplicate files removed : 1\n", + "Displaying contents of : output/03_exact_dedupe_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum1023618834f-9dfc-49a1-9066-e2724df95fec6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-02-04T23:08:56.8204441.846058lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
1spam.pdfFree xxx10227880888-8e1a-4b46-a6a9-fecba8eee0eb10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-02-04T23:08:58.4141201.590731spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101184f59118-2a64-4d4b-991c-10ca09576a7410729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-02-04T23:08:56.7134951.827202earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2[]
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101171ef93eb-66d3-4a21-bb47-cf4b85d4f8ff7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-02-04T23:08:58.2724961.547326mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10117ce227b2-66cb-4e2e-b76b-3a0a8c9d2f4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:08:54.8721451.864833earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...1[]
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", + "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", + "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", + "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", + "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
1 [] " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_exact_dedupe_dir)\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "1cedeca2", + "metadata": {}, + "source": [ + "## Step-7: Fuzzy Dedupe\n", + "\n", + "In the previous step, we removed **exact duplicates (identical documents)**.\n", + "\n", + "Fuzzy de-dupe can further filter out documents that are **not exactly identical, but nearly identical**.\n", + "\n", + "Here is a simple example:\n", + "\n", + "`Our solar system is a vast and fascinating expanse`\n", + "\n", + "`The solar system is a vast and fascinating expanse`\n", + "\n", + "Only one word is different: `Our` vs `The`.\n", + "\n", + "Imagine two documents that differ only by one extra blank line. For our purposes they are the same.\n", + "\n", + "[Fuzzy dedupe documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/fdedup)\n", + "\n", + "### Tweaking fuzzy matches\n", + "\n", + "**`jaccard_similarity_threshold`** is the parameter used to tweak similarities between documents. Its value is between 0 and 1.0. Values close to 1.0 mean stricter checking (fewer documents will qualify). 
A lower threshold means more lenient matches (more documents will qualify).\n", + "\n", + "Adjust this value to find what works for your documents." + ] + }, + { + "cell_type": "markdown", + "id": "3f21d132", + "metadata": {}, + "source": [ + "### 7.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f6430f24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-4: Processing input='output/03_exact_dedupe_out' --> output='output/04_fuzzy_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:09:39 INFO - Starting SignatureCalculation step\n", + "23:09:39 INFO - Got parameters for SignatureCalculation\n", + "23:09:39 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.9, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "23:09:39 INFO - data factory scdata_ is using local configuration without input/output path\n", + "23:09:39 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "23:09:39 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:39 INFO - pipeline id pipeline_id\n", + "23:09:39 INFO - code location None\n", + "23:09:39 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "23:09:39 INFO - actor creation delay 0\n", + "23:09:39 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:09:39 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:09:39 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:09:39 
INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:39 INFO - Running locally\n", + "2025-02-04 23:09:40,737\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - orchestrator started at 2025-02-04 23:09:41\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.727081298828125, 'object_store': 4.3635406494140625}\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 3 files (50.0%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - done flushing in 0.026 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 WARNING - table is empty, skipping processing\n", + "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 INFO - Starting flush()\n", + "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 INFO - Wrote 14 tables with a total size of 6,720 bytes\n", + "23:09:53 INFO - Completed execution in 0.224 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=2176343)\u001b[0m 23:09:43 INFO - Starting flush()\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", + "\u001b[36m(RayTransformFileProcessor pid=2176343)\u001b[0m 23:09:43 INFO - Wrote 14 tables with a total size of 13,440 bytes\u001b[32m [repeated 2x across cluster]\u001b[0m\n", + "23:09:54 INFO - SignatureCalculation completed successfully\n", + "23:09:54 INFO - Starting ClusterAnalysis step\n", + "23:09:54 INFO - Got parameters for ClusterAnalysis\n", + "23:09:54 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.9, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "23:09:54 INFO - pipeline id pipeline_id\n", + "23:09:54 INFO - code location None\n", + "23:09:54 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "23:09:54 INFO - actor creation delay 0\n", + "23:09:54 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:09:54 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands 
output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "23:09:54 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:09:54 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:54 INFO - Running locally\n", + "2025-02-04 23:09:55,736\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - orchestrator started at 2025-02-04 23:09:56\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Number of folders is 14\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.763642883859575, 'object_store': 4.381821441464126}\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 2 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 3 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 4 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 5 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 6 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 7 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 8 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 9 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 10 files in 0.0 
min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 11 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 11 files (78.571%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed processing 14 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - done flushing in 0.001 sec\n", + "23:10:08 INFO - Completed execution in 0.222 min, execution result 0\n", + "23:10:09 INFO - ClusterAnalysis completed successfully\n", + "23:10:09 INFO - Starting GetDuplicateList step\n", + "23:10:09 INFO - Got parameters for GetDuplicateList\n", + "23:10:09 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "23:10:09 INFO - pipeline id pipeline_id\n", + "23:10:09 INFO - code location None\n", + "23:10:09 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "23:10:09 INFO - actor creation delay 0\n", + "23:10:09 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:10:09 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:10:09 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:10:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:10:09 INFO - Running locally\n", + "2025-02-04 23:10:10,430\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - orchestrator started at 2025-02-04 23:10:11\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Number of folders is 1\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.75473709218204, 'object_store': 4.3773685451596975}\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - Completed processing 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=2179461)\u001b[0m 23:10:12 INFO - Get Duplicate List for folder docs_to_remove\n", + "\u001b[36m(RayTransformFileProcessor pid=2179461)\u001b[0m 23:10:12 INFO - 0 documents marked as duplicates\n", + "23:10:22 INFO - Completed execution in 0.223 min, execution result 0\n", + "23:10:24 INFO - GetDuplicateList completed successfully\n", + "23:10:24 INFO - Starting DataCleaning step\n", + "23:10:24 INFO - Got parameters for DataCleaning\n", + "23:10:24 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "23:10:24 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "23:10:24 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "23:10:24 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + 
"23:10:24 INFO - pipeline id pipeline_id\n", + "23:10:24 INFO - code location None\n", + "23:10:24 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "23:10:24 INFO - actor creation delay 0\n", + "23:10:24 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:10:24 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "23:10:24 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:10:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:10:24 INFO - Running locally\n", + "2025-02-04 23:10:25,111\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - orchestrator started at 2025-02-04 23:10:26\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.744503784924746, 'object_store': 4.37225189153105}\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 3 
files (50.0%) in 0.003 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=2180888)\u001b[0m 23:10:27 WARNING - table is empty, skipping processing\n", + "23:10:37 INFO - Completed execution in 0.224 min, execution result 0\n", + "23:10:38 INFO - DataCleaning completed successfully\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 603 ms, sys: 679 ms, total: 1.28 s\n", + "Wall time: 59.2 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_fdedup.ray.transform import Fdedup\n", + "\n", + "STAGE = 4\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{output_exact_dedupe_dir}' --> output='{output_fuzzy_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Fdedup(input_folder=output_exact_dedupe_dir,\n", + " output_folder=output_fuzzy_dedupe_dir,\n", + " contents_column= \"contents\",\n", + " # document_id_column= \"doc_id\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " jaccard_similarity_threshold = 0.9, # between 0 - 1. 
higher means more strict checking\n", + " operation_mode=\"filter_duplicates\",\n", + " # operation_mode=\"annotate\",\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " ).transform()\n", + "\n", + "# if result == 0:\n", + "# print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "# else:\n", + "# raise Exception (f\"โŒ Stage:{STAGE} failed (result={result})\")" + ] + }, + { + "cell_type": "markdown", + "id": "037d3974", + "metadata": {}, + "source": [ + "### 7.2 - Inspect Output\n", + "\n", + "FuzzyDedupe will write documents that are filtered in **output/04_fuzzy_dedupe_out/cleaned** folder\n", + "\n", + "You will notice only one **earth.pdf** made it! So fuzzy dedupe did filter out the almost identical doc." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d59496f0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 5\n", + "Output files after exact dedupe : 5\n", + "Near duplicate files removed : 0\n", + "Displaying contents of : output/04_fuzzy_dedupe_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum1023618834f-9dfc-49a1-9066-e2724df95fec6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-02-04T23:08:56.8204441.846058lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
1spam.pdfFree xxx10227880888-8e1a-4b46-a6a9-fecba8eee0eb10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-02-04T23:08:58.4141201.590731spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101184f59118-2a64-4d4b-991c-10ca09576a7410729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-02-04T23:08:56.7134951.827202earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2[]
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101171ef93eb-66d3-4a21-bb47-cf4b85d4f8ff7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-02-04T23:08:58.2724961.547326mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10117ce227b2-66cb-4e2e-b76b-3a0a8c9d2f4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-02-04T23:08:54.8721451.864833earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...1[]
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", + "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", + "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", + "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", + "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
1 [] " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "output_df = read_parquet_files_as_df(os.path.join(output_fuzzy_dedupe_dir, \"cleaned\"))\n", + "\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Near duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_fuzzy_dedupe_dir)\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "c3e4f860", + "metadata": {}, + "source": [ + "## Step-8: Document Quality\n", + "\n", + "This handy plugin will score documents across many metrics.\n", + "\n", + "Here we will look for 'bad words' metric.\n", + "\n", + "[Document quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)\n", + "\n", + "By default it uses [bad words collection](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality/dpk_doc_quality/ldnoobw). 
You can supply a custom file by passing an argument `bad_word_filepath=/path/to/badwords_file`" + ] + }, + { + "cell_type": "markdown", + "id": "144a0fff", + "metadata": {}, + "source": [ + "### 8.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "63140942", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐Ÿƒ๐Ÿผ STAGE-5: Processing input='output/04_fuzzy_dedupe_out/cleaned' --> output='output/05_doc_quality_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "23:10:38 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "23:10:38 INFO - data factory docq_ is using local configuration without input/output path\n", + "23:10:38 INFO - data factory docq_ max_files -1, n_sample -1\n", + "23:10:38 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:10:38 INFO - pipeline id pipeline_id\n", + "23:10:38 INFO - code location None\n", + "23:10:38 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "23:10:38 INFO - actor creation delay 0\n", + "23:10:38 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:10:38 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "23:10:38 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:10:38 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint 
['.parquet']\n", + "23:10:38 INFO - Running locally\n", + "2025-02-04 23:10:39,863\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - orchestrator started at 2025-02-04 23:10:41\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0069904327392578125, 'total_file_size': 0.04752826690673828}\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.727170563302934, 'object_store': 4.363585281185806}\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=2182506)\u001b[0m 23:10:41 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 3 files (60.0%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed processing 5 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - done flushing in 0.001 sec\n", + "23:10:52 INFO - Completed execution in 0.223 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=2182507)\u001b[0m 23:10:41 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "โœ… Stage:5 completed successfully\n", + "CPU times: user 116 ms, sys: 125 ms, total: 240 ms\n", + "Wall time: 14.8 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_doc_quality.ray.transform import DocQuality\n", + "\n", + "STAGE = 5\n", + "output_fuzzy_dedupe_cleaned_dir = os.path.join(output_fuzzy_dedupe_dir, \"cleaned\")\n", + "print (f\"๐Ÿƒ๐Ÿผ STAGE-{STAGE}: Processing input='{output_fuzzy_dedupe_cleaned_dir}' --> output='{output_doc_quality_dir}'\\n\", flush=True)\n", + "\n", + "result = DocQuality(input_folder=output_fuzzy_dedupe_cleaned_dir,\n", + " output_folder= output_doc_quality_dir,\n", + " docq_text_lang = \"en\",\n", + " docq_doc_content_column =\"contents\",\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " num_cpus= CONFIG_RAY_NUM_CPUS,\n", + " memory= CONFIG_RAY_MEMORY,\n", + " runtime_num_workers = CONFIG_RAY_RUNTIME_WORKERS,\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"โœ… Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"โŒ Stage:{STAGE} failed (result={result})\")" + ] + }, + { + "cell_type": "markdown", + "id": "1006b475", + "metadata": {}, + "source": [ + "### 8.2 - Inspect the Output\n", + "\n", + "We will see several new columns starting with the name **docq_**.\n", + "\n", + "Look at the column **docq_contain_bad_word**; this will 
flag documents with 'bad words'.\n", + "\n", + "Also inspect the column **docq_lorem_ipsum_ratio**; this will flag documents with 'lorem ipsum' text.\n", + "\n", + "For more information see: [Doc Quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "24181587", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/05_doc_quality_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum1023618834f-9dfc-49a1-9066-e2724df95fec6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...35...5.0000000.00000010.0857140.0False0.0000000.01.000000False
1spam.pdfFree xxx10227880888-8e1a-4b46-a6a9-fecba8eee0eb10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...8...3.5000000.00000010.0000000.0True0.0000000.01.000000False
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101184f59118-2a64-4d4b-991c-10ca09576a7410729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...610...4.5412840.02752390.0000000.0False0.1764710.00.880734True
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101171ef93eb-66d3-4a21-bb47-cf4b85d4f8ff7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.0000000.0False0.1764710.00.880000True
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10117ce227b2-66cb-4e2e-b76b-3a0a8c9d2f4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.0000000.0False0.1764710.00.880734True
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "\n", + " hash size ... \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 ... \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 ... \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... 
\n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "0 5.000000 0.000000 1 \n", + "1 3.500000 0.000000 1 \n", + "2 4.541284 0.027523 9 \n", + "3 4.688000 0.032000 8 \n", + "4 4.541284 0.027523 9 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "0 0.085714 0.0 False \n", + "1 0.000000 0.0 True \n", + "2 0.000000 0.0 False \n", + "3 0.000000 0.0 False \n", + "4 0.000000 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "0 0.000000 0.0 \n", + "1 0.000000 0.0 \n", + "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", + "4 0.176471 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "0 1.000000 False \n", + "1 1.000000 False \n", + "2 0.880734 True \n", + "3 0.880000 True \n", + "4 0.880734 True \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "print (\"Displaying contents of : \", output_doc_quality_dir)\n", + "output_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c343b656", + "metadata": {}, + "source": [ + "### 8.3 - Filtering 'quality' documents\n", + "\n", + "So from the output above we see **spam.pdf** is flagged for containing bad words (**docq_contain_bad_word=True**).\n", + "\n", + "Also **lorem-ipsum.pdf** is flagged for placeholder content **lorem ipsum** (**docq_lorem_ipsum_ratio > 0**).\n", + "\n", + "We are going to filter them both out." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4b3dee53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101184f59118-2a64-4d4b-991c-10ca09576a7410729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...610...4.5412840.02752390.00.0False0.1764710.00.880734True
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101171ef93eb-66d3-4a21-bb47-cf4b85d4f8ff7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.00.0False0.1764710.00.880000True
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10117ce227b2-66cb-4e2e-b76b-3a0a8c9d2f4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.00.0False0.1764710.00.880734True
\n", + "

3 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "\n", + " hash size ... \\\n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 ... \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "2 4.541284 0.027523 9 \n", + "3 4.688000 0.032000 8 \n", + "4 4.541284 0.027523 9 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "2 0.0 0.0 False \n", + "3 0.0 0.0 False \n", + "4 0.0 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", + "4 0.176471 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "2 0.880734 True \n", + "3 0.880000 True \n", + "4 0.880734 True \n", + "\n", + "[3 rows x 27 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_docs_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "\n", + "# remove documents with badwords\n", + "clean_docs_df = all_docs_df[all_docs_df['docq_contain_bad_word'] == False]\n", + "\n", + "# also filter out 'lorem ipsum' text\n", + "clean_docs_df = clean_docs_df[clean_docs_df['docq_lorem_ipsum_ratio'] == 0]\n", + "\n", + 
"clean_docs_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "5861461a", + "metadata": {}, + "source": [ + "## Step-9: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "8d1b50f7", + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(output_final_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_final_dir, exist_ok=True)\n", + "\n", + "output_final_dir_parquet = os.path.join (output_final_dir, 'pq')\n", + "shutil.os.makedirs(output_final_dir_parquet, exist_ok=True)\n", + "\n", + "output_final_dir_markdown = os.path.join (output_final_dir, 'markdown')\n", + "shutil.os.makedirs(output_final_dir_markdown, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ba897dd9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Saved CLEAN parquet output to 'output/output_final/pq'\n" + ] + } + ], + "source": [ + "## save parquet\n", + "\n", + "clean_docs_df.to_parquet(os.path.join(output_final_dir_parquet, \"clean_docs.parquet\"))\n", + "print (f\"✅ Saved CLEAN parquet output to '{output_final_dir_parquet}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "867bb0f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Saved CLEAN markdown output to 'output/output_final/markdown'\n" + ] + } + ], + "source": [ + "## save markdown text\n", + "\n", + "for index, row in clean_docs_df.iterrows():\n", + " output_file_name = os.path.join (output_final_dir_markdown, row['filename'] + '.md')\n", + " with open(output_file_name, 'w') as output_file:\n", + " output_file.write(row['contents'])\n", + "\n", + "print (f\"✅ Saved CLEAN markdown output to '{output_final_dir_markdown}'\")\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name":
"dpk-6-pdf-processing-r1.0.0-all-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "06107a2f48b3491f91bbe84e46e10ba0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_68997339f13240a4824a9e416096bee4", + "placeholder": "โ€‹", + "style": "IPY_MODEL_919b086abd314077bbff75687392bd91", + "value": "" + } + }, + "68997339f13240a4824a9e416096bee4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6c08de2dd9a2402c90b1a7a645db9b13": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7e13e8779a81400f996d4428c74acfaf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_91fff81a1de8487c9009e872b751edb0", + "placeholder": "โ€‹", + "style": "IPY_MODEL_ada62d24cbcf4361acbb21808f334d33", + "value": "โ€‡0/0โ€‡[00:00<?,โ€‡?it/s]" + } + }, + "8b7571c585df431eb901fcdebdf8177e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_06107a2f48b3491f91bbe84e46e10ba0", + 
"IPY_MODEL_bd74356eca18423aa0373c808d9097e3", + "IPY_MODEL_7e13e8779a81400f996d4428c74acfaf" + ], + "layout": "IPY_MODEL_a75892696be546a3970962bae7bf732a" + } + }, + "919b086abd314077bbff75687392bd91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "91fff81a1de8487c9009e872b751edb0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a75892696be546a3970962bae7bf732a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + 
"model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ada62d24cbcf4361acbb21808f334d33": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b4c209371e7a403986991a786cfb296d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "bd74356eca18423aa0373c808d9097e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b4c209371e7a403986991a786cfb296d", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6c08de2dd9a2402c90b1a7a645db9b13", + "value": 0 + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/pdf-processing-1/requirements.txt b/examples/notebooks/pdf-processing-1/requirements.txt new file mode 100644 index 000000000..ffd42dafe --- /dev/null +++ b/examples/notebooks/pdf-processing-1/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit-transforms[ray,all]==1.0.0 + +# jupyter +jupyterlab +ipykernel +ipywidgets