From afffaa2134a1325f77c6661f6e05d4b1f34c94ed Mon Sep 17 00:00:00 2001 From: Sujee Maniyam Date: Tue, 21 Jan 2025 00:22:02 -0800 Subject: [PATCH 1/6] examples/notebooks/pdf-processing-1 - upgraded to simpler API - redid the processing flow - expanded the examples to include doc-quality plugin Signed-off-by: Sujee Maniyam --- .../notebooks/intro/dpk_intro_1_python.ipynb | 3667 ----------------- .../{intro => pdf-processing-1}/.gitignore | 0 .../{intro => pdf-processing-1}/README.md | 23 +- .../archived}/my_utils.py | 0 .../data-prep-kit-3-workflow.excalidraw | 0 .../images/data-prep-kit-3-workflow.png | Bin .../pdf-processing-1/input/earth-copy.pdf | Bin 0 -> 58535 bytes .../input}/earth.md | 0 .../input}/earth.pdf | Bin 58535 -> 58535 bytes .../pdf-processing-1/input/earth2.md | 18 + .../pdf-processing-1/input/earth2.pdf | Bin 0 -> 58532 bytes .../notebooks/pdf-processing-1/input/lorem.md | 3 + .../pdf-processing-1/input/lorem.pdf | Bin 0 -> 25723 bytes .../input}/mars.md | 0 .../input}/mars.pdf | Bin 57872 -> 57872 bytes .../notebooks/pdf-processing-1/input/spam.md | 1 + .../notebooks/pdf-processing-1/input/spam.pdf | Bin 0 -> 24868 bytes .../pdf_processing_1_python.ipynb | 2661 ++++++++++++ .../pdf_processing_1_ray.ipynb} | 0 19 files changed, 2700 insertions(+), 3673 deletions(-) delete mode 100644 examples/notebooks/intro/dpk_intro_1_python.ipynb rename examples/notebooks/{intro => pdf-processing-1}/.gitignore (100%) rename examples/notebooks/{intro => pdf-processing-1}/README.md (66%) rename examples/notebooks/{intro => pdf-processing-1/archived}/my_utils.py (100%) rename examples/notebooks/{intro => pdf-processing-1}/images/data-prep-kit-3-workflow.excalidraw (100%) rename examples/notebooks/{intro => pdf-processing-1}/images/data-prep-kit-3-workflow.png (100%) create mode 100644 examples/notebooks/pdf-processing-1/input/earth-copy.pdf rename examples/notebooks/{intro/input/solar-system => pdf-processing-1/input}/earth.md (100%) rename 
examples/notebooks/{intro/input/solar-system => pdf-processing-1/input}/earth.pdf (99%) create mode 100644 examples/notebooks/pdf-processing-1/input/earth2.md create mode 100644 examples/notebooks/pdf-processing-1/input/earth2.pdf create mode 100644 examples/notebooks/pdf-processing-1/input/lorem.md create mode 100644 examples/notebooks/pdf-processing-1/input/lorem.pdf rename examples/notebooks/{intro/input/solar-system => pdf-processing-1/input}/mars.md (100%) rename examples/notebooks/{intro/input/solar-system => pdf-processing-1/input}/mars.pdf (99%) create mode 100644 examples/notebooks/pdf-processing-1/input/spam.md create mode 100644 examples/notebooks/pdf-processing-1/input/spam.pdf create mode 100644 examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb rename examples/notebooks/{intro/dpk_intro_1_ray.ipynb => pdf-processing-1/pdf_processing_1_ray.ipynb} (100%) diff --git a/examples/notebooks/intro/dpk_intro_1_python.ipynb b/examples/notebooks/intro/dpk_intro_1_python.ipynb deleted file mode 100644 index ab7cda854..000000000 --- a/examples/notebooks/intro/dpk_intro_1_python.ipynb +++ /dev/null @@ -1,3667 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", - "metadata": { - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" - }, - "source": [ - "# Data Prep Kit Demo 1 - Python version\n", - "\n", - "This notebook will introduce DPK and showcase some of it's capabilities.\n", - "\n", - "Here is the workflow\n", - "\n", - "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n" - ] - }, - { - "cell_type": "markdown", - "id": "b15976e3", - "metadata": { - "id": "b15976e3" - }, - "source": [ - "## How to run this notebook\n", - "\n", - "Two options:\n", - "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_python.ipynb)\n", - "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", - "\n", - "The notebook will work as in both environments" - ] - }, - { - "cell_type": "markdown", - "id": "eb8b0d5c", - "metadata": { - "id": "eb8b0d5c" - }, - "source": [ - "## Step-1: Inspect the Data\n", - "\n", - "We will use simple PDFs about Solar system. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n", - "\n", - "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n", - "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n" - ] - }, - { - "cell_type": "markdown", - "id": "39a0ab6e", - "metadata": { - "id": "39a0ab6e" - }, - "source": [ - "## Step-2: Figure out Runtime Environment\n", - "\n", - "### 2.1 - Determine runtime\n", - "\n", - "Determine if we are running on Google colab or local python environment" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1fe354b7", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1fe354b7", - "outputId": "5c153f72-08ed-4d6e-ccc7-dae851e7fd8b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "markdown", - "id": "8e7c104b", - "metadata": { - "id": "8e7c104b" - }, - "source": [ - "### 2.2 -Download Data if running on Google Colab" - ] - }, - { - "cell_type": 
"code", - "execution_count": 2, - "id": "3309799e", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3309799e", - "outputId": "99530315-6dd5-405d-dbde-61e2332e441b" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " !mkdir -p 'input/solar-system'\n", - " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n", - " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n", - " !wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'" - ] - }, - { - "cell_type": "markdown", - "id": "a5dc2b68", - "metadata": { - "id": "a5dc2b68" - }, - "source": [ - "### 2.3 - Install dependencies if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1fcec577", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1fcec577", - "outputId": "0f77fc39-ffeb-48da-ce6f-1750d8d3ad62" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " ! 
pip install --default-timeout=100 \\\n", - " data-prep-toolkit==0.2.1 \\\n", - " data-prep-toolkit-transforms==0.2.1 \\\n", - " deepsearch-toolkit\n" - ] - }, - { - "cell_type": "markdown", - "id": "243322b8", - "metadata": { - "id": "243322b8" - }, - "source": [ - "### 2.4 - Restart Runtime\n", - "\n", - "After installing dependencies, be sure restart runtime, so libraries will be loaded\n", - "\n", - "You do this by going to **`Runtime --> Restart Session`**\n", - "\n", - "Then you can continue to the next step (no need to re-run the notebook)" - ] - }, - { - "cell_type": "markdown", - "id": "e8b10be1", - "metadata": { - "id": "e8b10be1" - }, - "source": [ - "## Step-2: Configuration" - ] - }, - { - "cell_type": "markdown", - "id": "356c66f7", - "metadata": { - "id": "356c66f7" - }, - "source": [ - "### 2.1 - Basic Config" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e4YMZrBuFycl", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e4YMZrBuFycl", - "outputId": "d7ee9449-4f21-4c9a-fa54-14b7f28d764a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "33345487", - "metadata": { - "id": "33345487" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "## Configuration\n", - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig ()\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", - "\n", - "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", - "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", - "\n", - "## Embedding model\n", - "MY_CONFIG.EMBEDDING_MODEL = 
'sentence-transformers/all-MiniLM-L6-v2'" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b15e6827", - "metadata": { - "id": "b15e6827" - }, - "outputs": [], - "source": [ - "## Add parent dir to path\n", - "import os,sys\n", - "\n", - "this_dir = os.path.abspath('')\n", - "parent_dir = os.path.dirname(this_dir)\n", - "sys.path.append (os.path.abspath (parent_dir))" - ] - }, - { - "cell_type": "markdown", - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", - "metadata": { - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" - }, - "source": [ - "### 2.2 - Setup input/outpur directories" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "outputId": "4d5511fb-1c6f-47df-e5ea-2c1b354d262f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Cleared output directory\n" - ] - } - ], - "source": [ - "import os, sys\n", - "import shutil\n", - "\n", - "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", - " raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", - "\n", - "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", - "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", - "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", - "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", - "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_embeddings_out')\n", - "\n", - "## clear output folder\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", - "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", - "\n", - "print (\"✅ Cleared output directory\")" - ] - }, - { - "cell_type": "markdown", - "id": 
"2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", - "metadata": { - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" - }, - "source": [ - "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", - "\n", - "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", - "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", - "metadata": { - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" - }, - "source": [ - "### 3.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "outputId": "c50847d4-f2c7-4559-f5f7-d6a3d025027d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" - ] - } - ], - "source": [ - "STAGE = 1\n", - "\n", - "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", - "output_folder = output_parquet_dir\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", - "metadata": { - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" - }, - "source": [ - "### 3.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 657, - "referenced_widgets": [ - "97b603697cfa4b4ea4e6735b6768ca35", - "e87e8d3262c54cfaaa8768505edacda3", - "b78aa40816e44f7fbebcb24ca68818b3", - "7053c9606a414e978636a7e241909504", - "da0787b239764847a731083997780a85", - 
"553f3c16839a49d79591d0fc4862bed6", - "c0eb5bc8f6ee427ca42204b3c56f9a4e", - "9d184ed175f0403fb03c2e13dfd04e0a", - "724778729161445c98b187031ae4f67c", - "1cb3bbf7d724411cbe9831543a4aecc0", - "06f9b33494984e4885d5aad813d1d2bc" - ] - }, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "outputId": "01d207fb-983d-40b2-e5f6-e38e3789110a" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:34:39 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", - "13:34:39 INFO - pipeline id pipeline_id\n", - "13:34:39 INFO - code location None\n", - "13:34:39 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", - "13:34:39 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:39 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "13:34:39 INFO - orchestrator pdf2parquet started at 2024-10-18 13:34:39\n", - "13:34:39 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", - "13:34:39 INFO - Initializing models\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "750f3b6951094b2eb68490c7f5f98148", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 10 files: 0%| | 0/10 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_idexthashsizedate_acquiredpdf_convert_timesource_filename
0mars.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...10116e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf
1earth.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...1011efbdbcb9-f0af-42f0-b191-2f14ce3ddc7cpdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdf
\n", - "" - ], - "text/plain": [ - " filename contents num_pages \\\n", - "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "\n", - " num_tables num_doc_elements document_id ext \\\n", - "0 0 11 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 pdf \n", - "1 0 11 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:43.410297 0.794765 earth.pdf " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(5)\n", - "\n", - "## To display certain columns\n", - "#parquet_df[['column1', 'column2', 'column3']].head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "e5058a21", - "metadata": { - "id": "e5058a21" - }, - "source": [ - "\n", - "### 3.4 - Understand the output\n", - "\n", - "Here are some interesting attributes to note:\n", - "\n", - "- **filename** : original filename\n", - "- **contents** : text\n", - "- **document_id**: unique id (UUID) assignd to this document\n", - "- **hash** : hash of document\n", - "- **pdf_convert_time** : time to convert this pdf in seconds\n", - "\n", - "Let's inspect the **contents** column. See how the text is being divided up!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "f870e624", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f870e624", - "outputId": "0b4c054f-3a8a-4db3-f32f-17bd1466b102" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", - " 'filename': 'mars.pdf',\n", - " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.35137939,\n", - " 654.45184326,\n", - " 169.88169861,\n", - " 667.98492432],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.09541321,\n", - " 630.68127441,\n", - " 210.66503906,\n", - " 642.34405518],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.84518433,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.02520752],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. 
At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.18510437,\n", - " 570.83258057,\n", - " 374.99838257,\n", - " 581.07043457],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about the Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.22866821,\n", - " 542.98168945,\n", - " 163.86282349,\n", - " 554.45288086],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87440491,\n", - " 500.84011841,\n", - " 477.48345947,\n", - " 534.55810547],\n", - " 'page': 1,\n", - " 'span': [0, 196]}],\n", - " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", - " 'desert world with a thin atmosphere composed '\n", - " 'primarily of carbon dioxide. Its reddish hue comes '\n", - " 'from iron oxide, or rust, prevalent on its surface.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.2026062,\n", - " 482.90710449,\n", - " 237.04431152,\n", - " 493.07443237],\n", - " 'page': 1,\n", - " 'span': [0, 23]}],\n", - " 'text': 'Basic facts about Mars:',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 453.019104,\n", - " 477.48171997,\n", - " 474.9703064],\n", - " 'page': 1,\n", - " 'span': [0, 78]}],\n", - " 'text': '· Distance from the Sun: Average of 228 million '\n", - " 'kilometers (142 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.79351807,\n", - " 431.73287964,\n", - " 451.2142334],\n", - " 'page': 1,\n", - " 'span': [0, 64]}],\n", - " 'text': '· Rotation Period: 24.6 hours (one Martian day - '\n", - " 'called a \"sol\")',\n", - " 'type': 'paragraph'},\n", - " 
{'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 429.10913086,\n", - " 365.9559021,\n", - " 438.83737183],\n", - " 'page': 1,\n", - " 'span': [0, 44]}],\n", - " 'text': '· Moons: Two small moons, Phobos and Deimos.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.51646423],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } - ], - "source": [ - "import pprint\n", - "import json\n", - "\n", - "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", - "# json.loads(output_df.iloc[0, ]['contents'])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "e1a10c2d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e1a10c2d", - "outputId": "c1d992c2-faa8-40cd-c375-857970201daa" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", - " 'filename': 'earth.pdf',\n", - " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.30961609,\n", - " 654.45184326,\n", - " 174.04208374,\n", - " 667.93347168],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.12528992,\n", - " 
630.69073486,\n", - " 210.66503906,\n", - " 642.27935791],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87112427,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.04595947],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.20942688,\n", - " 570.81555176,\n", - " 375.57919312,\n", - " 581.08459473],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about our Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.15542603,\n", - " 542.98168945,\n", - " 167.32983398,\n", - " 554.36669922],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.91053772,\n", - " 512.46295166,\n", - " 477.84887695,\n", - " 534.48431396],\n", - " 'page': 1,\n", - " 'span': [0, 107]}],\n", - " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", - " 'planet. 
Earth is the only place we know of with life.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.30151367,\n", - " 494.86206055,\n", - " 240.17156982,\n", - " 505.07229614],\n", - " 'page': 1,\n", - " 'span': [0, 24]}],\n", - " 'text': 'Basic facts about Earth:',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 464.97409058,\n", - " 477.47979736,\n", - " 487.02810669],\n", - " 'page': 1,\n", - " 'span': [0, 79]}],\n", - " 'text': '· Distance from the Sun: Average of 149.6 million '\n", - " 'kilometers (93 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 452.86901855,\n", - " 317.90722656,\n", - " 463.24041748],\n", - " 'page': 1,\n", - " 'span': [0, 37]}],\n", - " 'text': '· Rotation Period: 24 hours (one day)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.71496582,\n", - " 396.66357422,\n", - " 451.19915771],\n", - " 'page': 1,\n", - " 'span': [0, 52]}],\n", - " 'text': '· Moons: One moon, called Luna or simply \"the Moon\".',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.53633118],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" - ] - } - ], - "source": [ - "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" - ] - }, - { - "cell_type": "markdown", - "id": "72274586", - "metadata": { - "id": "72274586" - }, - "source": [ - "## Step-4: Doc chunks\n", - "\n", - "In the previous step, we have extracted text from oru PDFs. 
But we have the content of entire file as 'one row' in our parquet output.\n", - "\n", - "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", - "\n", - "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", - "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", - "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", - "which provides the required JSON structure." - ] - }, - { - "cell_type": "markdown", - "id": "96198fa6", - "metadata": { - "id": "96198fa6" - }, - "source": [ - "### 4.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "305f00a3", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "305f00a3", - "outputId": "dd511f34-bab3-4dde-d938-493debb02e5e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" - ] - } - ], - "source": [ - "STAGE = 2\n", - "\n", - "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_chunk_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "369f2cd1", - "metadata": { - "id": "369f2cd1" - }, - "source": [ - "### 4.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "5b7b18d5", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": 
"5b7b18d5", - "outputId": "e0b87171-9d66-473f-e66a-e4b6ae3c3f66" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:34:45 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", - "13:34:45 INFO - pipeline id pipeline_id\n", - "13:34:45 INFO - code location None\n", - "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", - "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:34:45 INFO - orchestrator doc_chunk started at 2024-10-18 13:34:45\n", - "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", - "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", - "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", - "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:34:45 INFO - done flushing in 0.0 sec\n", - "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:2 completed successfully\n", - "CPU times: user 826 ms, sys: 101 ms, total: 928 ms\n", - "Wall time: 923 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from doc_chunk_transform_python import DocChunkPythonTransformConfiguration\n", - 
"\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "params = {\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # doc_chunk arguments\n", - " # ...\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = PythonTransformLauncher(DocChunkPythonTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "213afdf6", - "metadata": { - "id": "213afdf6" - }, - "source": [ - "### 4.3 - Inspect Generated output\n", - "\n", - "We would see documents are split into many chunks" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "d8138d43", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 897 - }, - "id": "d8138d43", - "outputId": "fd01e0cb-899e-4c73-d50e-5f4e6f5ff802" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Files processed : 2\n", - "Chunks created : 8\n", - "Input data dimensions (rows x columns)= (2, 12)\n", - "Output data dimensions (rows x columns)= (8, 16)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\n· Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\n· Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "3 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "7 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "3 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "7 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\n· Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 
" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (f\"Files processed : {input_df.shape[0]:,}\")\n", - "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "9e9ca75c", - "metadata": { - "id": "9e9ca75c" - }, - "source": [ - "### 4.4 - Understanding the Output\n", - "\n", - "Here we see 2 PDF files are split into 6 chunks. Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n", - "\n", - "See how **document_id** is carried throughout. This helps us identify original documents.\n", - "\n", - "Also note **contents** is now plain text (not JSON as before)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "3090c950", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - }, - "id": "3090c950", - "outputId": "0f4b6771-8d38-4a27-c756-21f916b23a4f" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfSolar System\\nFor more details about the Solar...
2mars.pdfMars\\nMars, the fourth planet from the Sun, is...
3mars.pdfBasic facts about Mars:\\n· Distance from the S...
4earth.pdfSolar System\\nOur solar system is a vast and f...
5earth.pdfSolar System\\nFor more details about our Solar...
6earth.pdfEarth\\nEarth is the third planet from the Sun....
7earth.pdfEarth\\nBasic facts about Earth:\\n· Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Solar System\\nFor more details about the Solar...\n", - "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "3 mars.pdf Basic facts about Mars:\\n· Distance from the S...\n", - "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "5 earth.pdf Solar System\\nFor more details about our Solar...\n", - "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "7 earth.pdf Earth\\nBasic facts about Earth:\\n· Distance fr..." - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d5f151ae", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "d5f151ae", - "outputId": "a4c491b2-53db-4d71-da24-4479de8d1d65" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. 
Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 3------\n", - "Basic facts about Mars:\n", - "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "· Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "· Rotation Period: 24 hours (one day)\n", - "· Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "7ad1c60d", - "metadata": { - "id": "7ad1c60d" - }, - "source": [ - "## Step-5: DOC ID generation of Chunks\n", - "\n", - "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", - "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. 
The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", - "\n", - "**This is a pre-requisite for fuzzy dedup** in the pipeline." - ] - }, - { - "cell_type": "markdown", - "id": "1afaa0fd", - "metadata": { - "id": "1afaa0fd" - }, - "source": [ - "### 5.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "6ffd6f54", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6ffd6f54", - "outputId": "1784c80d-6309-4913-9f55-c018b978968f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" - ] - } - ], - "source": [ - "\n", - "# Input for this stage is the output of exact dedeup component\n", - "# output of this component makes it possible for fdedup component to run on data.\n", - "\n", - "STAGE = 3\n", - "\n", - "input_folder = output_chunk_dir\n", - "output_folder = output_docid_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "f78a51b7", - "metadata": { - "id": "f78a51b7" - }, - "source": [ - "### 5.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "5fc77557", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5fc77557", - "outputId": "db2b8670-543e-4073-9c7d-3f9ef5f4317e" - }, - "outputs": [ - { - "name": 
"stderr", - "output_type": "stream", - "text": [ - "13:34:45 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", - "13:34:45 INFO - pipeline id pipeline_id\n", - "13:34:45 INFO - code location None\n", - "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", - "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:34:45 INFO - orchestrator doc_id started at 2024-10-18 13:34:45\n", - "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", - "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", - "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", - "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:34:45 INFO - done flushing in 0.0 sec\n", - "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:3 completed successfully\n", - "CPU times: user 12.8 ms, sys: 3.7 ms, total: 16.5 ms\n", - "Wall time: 13.1 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "params = {\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " # doc id configuration\n", - " \"doc_id_doc_column\": \"contents\",\n", - " \"doc_id_hash_column\": \"chunk_hash\",\n", - " \"doc_id_int_column\": \"chunk_id\",\n", - "}\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "a9a8c1fa", - "metadata": { - "id": "a9a8c1fa" - }, - "source": [ - "### 5.3 - Inspect Generated output\n", - "\n", - "You will notice we have two extra columns\n", - "\n", - "- **hash_column**\n", - "- **int_id_column**\n", - "\n", - "But still the same number or rows as before" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "da9adede", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 860 - }, - "id": "da9adede", - "outputId": "036db4ca-12f6-4b3e-9d7f-fa70e494870d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 16)\n", - "Output data dimensions (rows x columns)= (8, 18)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_id
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6
3mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\n· Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2
7earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\n· Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "3 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "7 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "3 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "7 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... 
$.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\n· Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 
2 \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", - "metadata": { - "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53" - }, - "source": [ - "## Step-6: Exact Dedup\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", - "metadata": { - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" - }, - "source": [ - "### 6.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "4c7a1b94", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4c7a1b94", - "outputId": "2f6f05bc-f6fd-4d66-ea01-ed89cd5b80f3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" - ] - } - ], - "source": [ - "STAGE = 4\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_exact_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", - "metadata": { - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" - }, - "source": [ - "### 6.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": 
"a624b2b2-faad-4325-ac7d-53a840f564ef", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", - "outputId": "74dc0b75-58b5-4c97-9965-91315e8a98a5" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:34:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", - "13:34:45 INFO - pipeline id pipeline_id\n", - "13:34:45 INFO - code location None\n", - "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", - "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:34:45 INFO - orchestrator ededup started at 2024-10-18 13:34:45\n", - "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "13:34:45 INFO - Starting from the beginning\n", - "13:34:45 INFO - Completed 1 files (50.0%) in 0.0 min\n", - "13:34:45 INFO - Completed 2 files (100.0%) in 0.0 min\n", - "13:34:45 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:34:45 INFO - done flushing in 0.0 sec\n", - "13:34:45 INFO - Completed execution in 0.0 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:4 completed successfully\n", - "CPU times: user 17.6 ms, sys: 997 μs, total: 18.6 ms\n", - "Wall time: 15.2 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration\n", - "\n", - "\n", - "# Prepare 
the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "params = {\n", - " # Data access. Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # ededup parameters\n", - " \"ededup_doc_column\": \"contents\",\n", - " \"ededup_doc_id_column\": \"chunk_hash\",\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = PythonTransformLauncher(EdedupPythonTransformRuntimeConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "eaf1c3c3", - "metadata": { - "id": "eaf1c3c3" - }, - "source": [ - "### 6.3 - Inspect Generated output" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "d824ebf6", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 815 - }, - "id": "d824ebf6", - "outputId": "68f55770-c750-4607-a205-ba183603019d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (7, 19)\n", - "Input chunks before exact dedupe : 8\n", - "Output chunks after exact dedupe : 7\n", - "Duplicate chunks removed : 1\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremoved
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[]
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\n· Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[]
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[]
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[]
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[]
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\n· Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[]
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "3 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "3 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\n· Distance from the S... $.main-text[6] \n", - "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", - "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "6 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", - "\n", - " removed \n", - "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... 
\n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "82cc9bb0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 269 - }, - "id": "82cc9bb0", - "outputId": "46d9e91d-c470-4e3e-e5c8-508c534dbceb" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nFor more details about the Solar...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\n· Distance from the S...
3earth.pdfSolar System\\nOur solar system is a vast and f...
4earth.pdfSolar System\\nFor more details about our Solar...
5earth.pdfEarth\\nEarth is the third planet from the Sun....
6earth.pdfEarth\\nBasic facts about Earth:\\n· Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nFor more details about the Solar...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\n· Distance from the S...\n", - "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "4 earth.pdf Solar System\\nFor more details about our Solar...\n", - "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "6 earth.pdf Earth\\nBasic facts about Earth:\\n· Distance fr..." - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "cc61dffa", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cc61dffa", - "outputId": "7fb26043-8538-48b6-80b7-16ceb818c1a8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "· Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "· Rotation Period: 24 hours (one day)\n", - "· Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "383f40ba", - "metadata": { - "id": "383f40ba" - }, - "source": [ - "### 6.4 - Understanding the output\n", - "\n", - "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", - "\n", - "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", - "\n", - "```text\n", - "## Solar System\n", - "\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "85309751-8556-41c6-ac32-84acc941bc8d", - "metadata": { - "id": "85309751-8556-41c6-ac32-84acc941bc8d" - }, - "source": [ - " ## Step-7: Fuzzy Dedup\n", - "\n", - "And fuzzy dedupe is only available in RAY version. 
So we will skip it here\n", - "\n", - "See this file [dpk_intro_1_ray.ipynb](dpk_intro_1_ray.ipynb)" - ] - }, - { - "cell_type": "markdown", - "id": "5370950a-2a3a-4143-8218-f9b4808099ba", - "metadata": { - "id": "5370950a-2a3a-4143-8218-f9b4808099ba" - }, - "source": [ - "## Step-8: Text encoding\n", - "\n", - "Encode text for the vector storage." - ] - }, - { - "cell_type": "markdown", - "id": "85aba685", - "metadata": { - "id": "85aba685" - }, - "source": [ - "### 8.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "outputId": "41d268f5-7cc6-432e-d56e-2ba882fbdba6" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-6: Processing input='output/04_exact_dedupe_out' --> output='output/05_embeddings_out'\n" - ] - } - ], - "source": [ - "STAGE = 6\n", - "\n", - "input_folder = output_exact_dedupe_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_embeddings_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "c97545f4", - "metadata": { - "id": "c97545f4" - }, - "source": [ - "### 8.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "outputId": "b2119b07-0654-45cd-f729-1396e18b24b1" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:34:45 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 
'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", - "13:34:45 INFO - pipeline id pipeline_id\n", - "13:34:45 INFO - code location None\n", - "13:34:45 INFO - data factory data_ is using local data access: input_folder - output/04_exact_dedupe_out output_folder - output/05_embeddings_out\n", - "13:34:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:34:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:34:45 INFO - orchestrator text_encoder started at 2024-10-18 13:34:45\n", - "13:34:45 INFO - Number of files is 2, source profile {'max_file_size': 0.010450363159179688, 'min_file_size': 0.010318756103515625, 'total_file_size': 0.020769119262695312}\n", - "13:34:47 INFO - Completed 1 files (50.0%) in 0.004 min\n", - "13:34:47 INFO - Completed 2 files (100.0%) in 0.005 min\n", - "13:34:47 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:34:47 INFO - done flushing in 0.0 sec\n", - "13:34:47 INFO - Completed execution in 0.034 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:6 completed successfully\n", - "CPU times: user 615 ms, sys: 146 ms, total: 761 ms\n", - "Wall time: 2.24 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from text_encoder_local_python import TextEncoderPythonTransformConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "params = {\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # text_encoder\n", - " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", - "}\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "# create launcher\n", - "launcher = PythonTransformLauncher(TextEncoderPythonTransformConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "b734852c", - "metadata": { - "id": "b734852c" - }, - "source": [ - "### 8.3 - Inspect Generated output\n", - "\n", - "You will see a column called `embeddings` added at the end. This the text content converted into vectors or embeddings. We used the model `sentence-transformers/all-MiniLM-L6-v2`" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "7b1c1d09", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 760 - }, - "id": "7b1c1d09", - "outputId": "018daa18-e5db-4483-d8d5-30aded80d5e3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (7, 19)\n", - "Output data dimensions (rows x columns)= (7, 20)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_hashchunk_idremovedembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Solar System\\nFor more details about the Solar...$.main-text[3]1[133.18510437, 570.83258057, 374.99838257, 581...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...5[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...[-0.051861435, 0.0035226212, 0.030617002, 0.04...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6[][0.07728295, 0.024970993, -0.043180738, 0.0580...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:34:44.2595450.845978mars.pdf6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2Basic facts about Mars:\\n· Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7[][0.10598018, 0.025460618, 0.023627337, 0.03905...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nOur solar system is a vast and f...$.main-text[2]1[132.87112427, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...0[][0.0077404436, -0.02055944, 0.026426593, 0.011...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cSolar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...1[][-0.062105548, -0.0053322907, 0.031277698, 0.0...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2[][0.072435796, -0.058001805, -0.019771898, -0.0...
6earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:34:43.4102970.794765earth.pdfefbdbcb9-f0af-42f0-b191-2f14ce3ddc7cEarth\\nBasic facts about Earth:\\n· Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3[][0.091821924, 0.015197902, 0.07716932, 0.01711...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "1 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "2 2024-10-18T13:34:44.259545 0.845978 mars.pdf \n", - "3 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "4 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "5 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "6 2024-10-18T13:34:43.410297 0.794765 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "1 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "2 6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 \n", - "3 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "4 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "5 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "6 efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\n· Distance from the S... $.main-text[6] \n", - "3 Solar System\\nOur solar system is a vast and f... 
$.main-text[2] \n", - "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "6 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", - "\n", - " chunk_hash chunk_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", - "\n", - " removed \\\n", - "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... \n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] \n", - "\n", - " embeddings \n", - "0 [-0.051861435, 0.0035226212, 0.030617002, 0.04... \n", - "1 [0.07728295, 0.024970993, -0.043180738, 0.0580... 
\n", - "2 [0.10598018, 0.025460618, 0.023627337, 0.03905... \n", - "3 [0.0077404436, -0.02055944, 0.026426593, 0.011... \n", - "4 [-0.062105548, -0.0053322907, 0.031277698, 0.0... \n", - "5 [0.072435796, -0.058001805, -0.019771898, -0.0... \n", - "6 [0.091821924, 0.015197902, 0.07716932, 0.01711... " - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "f5e12630-be6b-4188-a925-77117155617b", - "metadata": { - "id": "f5e12630-be6b-4188-a925-77117155617b" - }, - "source": [ - "## Step-9: Copy output to final output dir" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "outputId": "31f09b58-7b2d-48bb-9dac-bc0ba9625c01" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Copied output from 'output/05_embeddings_out' --> 'output/output_final'\n" - ] - } - ], - "source": [ - "import shutil\n", - "\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", - "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", - "\n", - "print (f\"✅ Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "dpk-2-basic-021-py311", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": 
"text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "06f9b33494984e4885d5aad813d1d2bc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "1cb3bbf7d724411cbe9831543a4aecc0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "553f3c16839a49d79591d0fc4862bed6": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7053c9606a414e978636a7e241909504": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1cb3bbf7d724411cbe9831543a4aecc0", - "placeholder": "​", - "style": "IPY_MODEL_06f9b33494984e4885d5aad813d1d2bc", - "value": " 10/10 [00:00<00:00, 349.38it/s]" - } - }, - "724778729161445c98b187031ae4f67c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - 
"state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "97b603697cfa4b4ea4e6735b6768ca35": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e87e8d3262c54cfaaa8768505edacda3", - "IPY_MODEL_b78aa40816e44f7fbebcb24ca68818b3", - "IPY_MODEL_7053c9606a414e978636a7e241909504" - ], - "layout": "IPY_MODEL_da0787b239764847a731083997780a85" - } - }, - "9d184ed175f0403fb03c2e13dfd04e0a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - 
"min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b78aa40816e44f7fbebcb24ca68818b3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9d184ed175f0403fb03c2e13dfd04e0a", - "max": 10, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_724778729161445c98b187031ae4f67c", - "value": 10 - } - }, - "c0eb5bc8f6ee427ca42204b3c56f9a4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "da0787b239764847a731083997780a85": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - 
"flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e87e8d3262c54cfaaa8768505edacda3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_553f3c16839a49d79591d0fc4862bed6", - "placeholder": "​", - "style": "IPY_MODEL_c0eb5bc8f6ee427ca42204b3c56f9a4e", - "value": "Fetching 10 files: 100%" - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/notebooks/intro/.gitignore b/examples/notebooks/pdf-processing-1/.gitignore similarity index 100% rename from examples/notebooks/intro/.gitignore rename to examples/notebooks/pdf-processing-1/.gitignore diff --git a/examples/notebooks/intro/README.md b/examples/notebooks/pdf-processing-1/README.md similarity index 66% rename from examples/notebooks/intro/README.md rename to examples/notebooks/pdf-processing-1/README.md index 77a80865b..a611183e9 100644 --- a/examples/notebooks/intro/README.md +++ b/examples/notebooks/pdf-processing-1/README.md @@ -1,6 
+1,6 @@ -# Data Prep Kit Introduction +# PDF Processing with Data Prep Kit -This is an example featuring some of the features of data prep kit. +Showcases Data Prep Kit's capabilities for processing PDFs ## Running the code @@ -14,9 +14,7 @@ conda create -n data-prep-kit -y python=3.11 conda activate data-prep-kit # install the following in 'data-prep-kit' environment -pip3 install data-prep-toolkit==0.2.1 -pip3 install data-prep-toolkit-transforms==0.2.1 -pip3 install data-prep-toolkit-transforms-ray==0.2.1 +pip3 install 'data-prep-toolkit-transforms[ray,all]==1.0.0a4' pip3 install jupyterlab ipykernel ipywidgets ## install custom kernel @@ -31,6 +29,19 @@ jupyter lab This notebook will demonstrate processing PDFs -`PDFs ---> text ---> chunks ---> exact dedupe ---> fuzzy dedupe ---> embeddings` +`PDFs ---> text ---> compute hash ---> dedupe ---> document quality` [python version](dpk_intro_1_python.ipynb)   |   [ray version](dpk_intro_1_ray.ipynb) + + +## Creating Input PDFs (Optional) + +```bash +cd input + +pandoc earth.md -o earth.pdf +pandoc earth2.md -o earth2.pdf +pandoc mars.md -o mars.pdf +pandoc spam.md -o spam.pdf +pandoc lorem.md -o lorem.pdf +``` \ No newline at end of file diff --git a/examples/notebooks/intro/my_utils.py b/examples/notebooks/pdf-processing-1/archived/my_utils.py similarity index 100% rename from examples/notebooks/intro/my_utils.py rename to examples/notebooks/pdf-processing-1/archived/my_utils.py diff --git a/examples/notebooks/intro/images/data-prep-kit-3-workflow.excalidraw b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw similarity index 100% rename from examples/notebooks/intro/images/data-prep-kit-3-workflow.excalidraw rename to examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw diff --git a/examples/notebooks/intro/images/data-prep-kit-3-workflow.png b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png similarity index 100% rename from
examples/notebooks/intro/images/data-prep-kit-3-workflow.png rename to examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png diff --git a/examples/notebooks/pdf-processing-1/input/earth-copy.pdf b/examples/notebooks/pdf-processing-1/input/earth-copy.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9a775a9984c7570f96ba5326cf63b6eb5039e76a GIT binary patch literal 58535 zcmagELy#~G%%$75ZQHhO+qP}ne%rQf+qP}nJ>R`E|7sSKMe1ae-BXpEI;09BVzi8O ztWc!$%OmShtON`M_C{7vJUmeJGNyLsE*1pr9IOQYyP)XBENxs&oe1c~Yz$pYMNEzD zO-!Nq_@JC!oJ7=0*Eim*JRc6VDuSq~R7s;wO81o(JFTLm)>I^P zY1T&6A4lPqTEAO;;w|K2S0QJjLKB(~kQ~_yT7)|iTFBVrkB+&P6J-s{D@0ON$}I)r zQNJh9*HR?w7XoADzbpconaqq8MT{k1Ny&+$(D?bKbP>7bJ{rOMu})%f)qC5L@9ypg zf3`RLT;`4{h^ppq!1#fFA@}cEODrVVTCHvGN*RyrGgSDc3?rf)QL~vG@gS2^R$#>% z{U}6buj>JAE#qt{B|qO=@3s+MiF@4(bozDA%^^ZS=KOf`0&kOl31vnk(p&8W98a{+ zky$*q4rkn(GyT?fV^KQr5&xCWhbhy6Jo<5x3s4=F4Z~%3@#&I_JJ=1NNbTb^9GN^F z35Rr6TWq22F)Jix`Nq>=q$X5@)>R||X`gr6kU(z8p@&sBx+TBZzFHW|z(RgO&Jni2 zc%*;9-@?%%oBpfm-rU?F1d{t(USPGwdeT12b8lEMA6Y9b3_Bw!Bsz8AiR6d!(Im*g<*q9xdh?Aa z3(tOr-Q^n&%*Ji?J94sg)_gK+Z!FR;I2^4lc(T0`jt}?V8(nZ6ia66Q%}6ZkPcEFMsmgTHB$Ni|L}dBcTWkvWUPj`p zDEMRKl?)|(mrgg(B|B7zpR4!`hOdyhnp!uAf?$2y^BWJ1K@9cFRmEPx`{Q1Vmw-ky zBYU+po1dYd%X3SZLVh!Z|JmH)fq!bf-+?0|9wS;eG~k<+#ffWG_XAu-+E#`#wKMrYZ2YJGKdzXW{x8572^iTq*#8GNCITh~7Eb2> z8JP(f{znopF|x67{{JNYzd>wnGgZ;v*<_8ec6`ep+4-+{IY8VgZfpP3lyZ&~bR#*3 z3KX^YJ#)looJT)ju`wp*dR%WhYg!uxB&ur02#nwvK%|0jwJ)7=1`psEQyk-*T@ z!4%`OIQ^63)6;PRl9Z<~ERHQ;oDo^=K%Rg)wQ2yQ%xm!?<^a^p%@0#N4wF24?$ zz%o4mPy*`6uJqyr&;gsr`1vqSj;0K(jvo%dxHL7m0WA2GMg-#Y=GM~GYkjQi7#P25 zpINk%5DBb|O`x9b>sr7vIMaZQF;Fo8q6uT+3s?fMP5|jZH8R&U0RL^|QoznHASR8T82{HCZrJw1nT`lY8cvbw}ldIoSr6m^9JP)NojC@CoYeN)CgzaQ93!pkrJ zJ^0%?Bfhuq3<~Lt=!|OwfnM742B-(v4xFW1_22phXRm}e3-~K}O|&|IY5Xe#4A`7p zSxNjCIWaU8ys$Vkl8Sh2GjVS52G8$GZH*oPKE8o=@8}A)3HXONIy3MsET>Mr-v{-r zixkk^f;m3``pzzi^h{s&6P|)EiSM@dNBBw@ak>pUWZv#rzpC!6)>y=6uKj&6;v{?A_(1c;{kCcuBA>sy!EI^5p|iYSUH3CZdb 
z<`F;mtUn*%Q(-J{VP)wh@4W9tWcb7raCdql@4>Ret+Z zm|Q)f34O_5rlkgWd}!c|zXoj;?i9^mmEGOhWYMAC%?Wri;!piL9P+^5HcdbsfEnN% z1i&j(lkvy)Wv=*}wdtGHUt<642*L?OBYnLC;HR2403RI$A0649pnqip_WbPWF77WH zL8Beubk(dDU--8{5#hIj?#KWT=>B`^d*HZF_pjx9`OiivGalCp&Z+(hC>@v{a0MkNVtSEMr}J%kyWi{SRUR=(j}b=TD;5EwT9(B$HaZ zD^v4Fdc}XkR7aLCOmBE#5$?T#=r5h+_m;RTKJbUbrTQ;d695d2F7EH%ce*g626%o1 zw146w9{Rmt!2kXS;g?<;Ah|jyD6X1i$3JF@Ul0k-jKJ9%TpEBHoV)n=opm{0}}%#BVS5SW?;-aj}n@-mPfJ9YPWocVKg{b}CkLv!ta z)CUd@0PzfxNmxx2IQ=eUv#32p^-UUOMpZ!Ri7{8%0NP@TT%g8e-U;C_^5i}dAT$zq z1E{GoLf~ns2mH4z5Uyl9i9plUHe*l8y2O4;HRw(D3-t>!jzom)nHhs6s0lR%n96Iy zGS=uB%4K@(tce?mI}8vTiOiT=yE~?_TH)S&#lrM68TSHY5uXH_#wKIV?J{So@@*G% zl40kt_9j04j*)yvMBV%}8fv(X+{h63(^JkR;y;c&A*p8NR|fz2WA}EGK?G5Z^j0^* zbazK@-f2V1GFWM;QaT@`n!YKgI;X99yXf{c_}09NRGLL@Up1~8-Jj}^PCCVZ9vE5t z23Nea0Zx1bueTmo7yzjhwXce+=h~;Uprup8zhW;y3_-`crn6g`=0M_M$*tzUmKToy z8pQDVBjzX?*%9i5=>}FVUHbTgYjikD>(-<`LvL&Ei~*mKfcz!KP(fK>Ol%u&`qoa7 zTwtT9b|uUf;@(QWGbPAuHZH(}C?o&iM2&xl;%sN6huqI7nT>NJ$U4BeFxAU>+8|Km zha%~j*6$X$!T8<2T!^vhKIlAQE!e#`3lOC`gNVOkfx5F@*AP-f0Y0GcbrVYN3;C7v z%o=;BYfUe~0?!;6zs3KQRd#3fK#4z%PpQ{LCWOO19?qDS)S4!^SEHK_a3-xv0tahm zOkzcW8`}Qu?hONxCda{q|AFOBOP?h1(p{X`i|Ow|(ZNq$Ml|Nf07V=q49+hocFEfU zdn^n@gyxe+q0Euuaf%-8=FvNSLyVbOeld@>ll{-!B{kTBBlkx=7Im}smd(&s4MCcrdUhxPNS>tSLh~dL!?Mv~5g|0Jxcpu`YF6fsjCfTz;)&%( z(Q9+=nP)bXM^x+lBfl!COL2?1jQ4S0zsXWf=(($+Xk4fQNq>V1^H|#O`J(ycxa@Fk zB+Z~=hyyJ$Jp_gAJ<;DOw@m_PKBnk|%$u6Zv$@}S47KMJwZF6t#g6A_95LC3&x}WBgKC2 z)~>*?rT#VGK?U*^kjZAbL$ql^W&~I6nR&rt%|i2L+Is;LM;=&ZiANrezvAipEo*QQ zRss;4L|bk&>ln$;4~XuSCsl69O|3TZ5W>zUyC-vpKd0qd!F73ghKJ;qVf`)w?a*n8FkKlEXK~j?>R~<VG4T&EHaM|%8~Z&WDfB$l`|!Fg55Mc?Xt)A>E@-mQ*}Pvz zEo2Mc)_aT>@_bnbEHMXK4L)dFAtd&4vHTDy>VUkc^>1BEbrfU97N4_bCPOS|U}HJ< zCk@{ieV4MX?pD9f@(IR`(QK;tI{N6##X>YN2*>$5H=8=#9;`W`rqPZ3e)d)JrnF*C zN=yW*6K{O#@~gC*6N6i@_1k-UHs($}J8rB(0HiXX;>p3!<>{$xrV&(9P6e!@GQ?hH z#Ked)NH+Y}4Y=Vf2j+$sFL$CfPM$jkwky)-)-4nlSRU&Ww`vX9-8OWcenM&f@|jSh zP?yA7#o9YjxoM(H(L;Fy_msX&DVUl#k%MRp^mvZp53V)kTq9aIyYEQxKsahA*k<}UuIP4*1G_@7fR 
z9a`@~d+_&JB>A(?a$3;Lr6JG8D4XXJC+UaJ`wtM4H*NZ#vLY{LohCSyk#SS#{cfm+%fENnyh7DY(^da>ivNHw%H^G4n8O2J1p77Tb9$Ck1$rfLEB9qJHANj zQ2He)vTwJUVgwRQ$e(C?8h)q-Hl)g97wS9DIti|c;s6@e62dAb700D7uE(zJLzpRy z7?zFAZRe`~xMMk1o}zh0kWjP>8$YnA3bkxo7!M&H)&az_m8j&_C4{>3on5F`Dw^W) zwyX&zbFvp*(SMl*6bthSm;{l<@6D4|9gfceS8GC@TA7o|f`RVJnggJf$@%uSNya^X zcE=)&lxJ6OR*oysC7m!vU^8|oIkYfh-CCO+_kieY`oH&b<$rWFkqMqLo-}?$7^cN+ zS`&@f&=IL!*j!{Pj;ncA3}_Z~l)H>C6*AhZ-Fl8f$9~X{k_OyKw+SA4TCn#b-#}aW z>KJ(I{=$@DBexJ2BpH;-3+4coR@C@tdo1npHO27jX?;6QN;&!eX4v|^7g>8BYrwEC zBKp`Yp=2AkUu4dL0vWv(7de{z<0_~w4HFgE0)Yy3t4M-p)gQ>W1sRZInIU24Y1P{t zzII3x0JCx-q(jempmKU~-%p@L`J7C8=3)--;|*c0Mm!HD)|Dzcmcf5fL>KM9F4H8a z`0)pAj){z|D0a2ew)<}xt-f7Ohn_r79OqXd+!MeQ-bi?5jw627fT)GcRWJ#n^OrKa zj>&7Sz(H^%!1KVPV!hs@NIx4v-O(6(Jw8cS!~*83X88q%<_gDCM`w52zM_;9i2&aG zCu&bwY-pUSBP^@r&n$~u*GU}wjb@xkGd_^EZxvf+`0E;w-X)G7(tAVachw*|MepZx zWu~5w?2iS@YYxUX-F8JF3cgLiRr5|%8X|j5wR?r9GNjQdU#yD#r3=K59xE+o^Mvu# z*_(W;6-(*`90S*&qTYLPsfrRi_L_!fad-Avl0gY2zU!%2b9(19;EB%$k#F6%dHmH% z_!NaY-eeK+sK}D*X}dw$S}mWhy#98yV5@!{QD)mCy@Z?bS$I#UmR&*nTBje%r|3FZK_E{p{%;% zf8+PEjA>>P^>hc8!!XgRzEI5r6=r&M)EgIK!F&6NbMH{ovOwaT{ECci*K~f=Ci;4EZtf;#^I`A^YA!r;?B?gm)E)I0Fvh zyEc<+2w)|P45rK$ov#u^%$d6&Dq4D%f)5P=@v=RP)%oi^`V(Srou^K`xIp9~%aMHM zTntU@Zz=Q3={nrMC^S;g6?zZaYd15UPt~T0ab}Z-NI_*?55SkjRi`5-E$PD`I#zBX zV(H7=yCM4;?>$ufM4@zyD9fP#U9a*~8c3N|Z~%c@m?~lQ&0i|x;G(dkKqq?UoSXk zV6OK7{e|*Ps^D3Zv%UQToX5A9FVa?BJF>n-zaujVX!bHSxlJklVv zh|X@d{C=6X7#WU3Yqn$ZCuKVjOwjV%3KnQ9v;k_%s5q4z6COjJqMG~(COdZJ=}s0x zB58Oo$9E>FB!`EGsc5U+$~k>sS)?Ou^1Q{t5nr8rm3WpJU$gTsNYr&{b zS8PBv0Alr&gcjaRvGDn}wR)>yHaJktkeC^Z#FlIlMuo;;>Q4U77cBPp;qopy5zdea z*gUPl-5V#NGEq}s@(V$J+0Fj~{H-f6ROB9ejgNjt5!t;|%yZC*7Pnm9CRsw%Bd zTMPTPptZLSRJzcI>hh_J$mm?|-oKorXW*Sup-VK=gJ)3+Sel7ft=tDrM`5z7hMjCd zASz*HEVNfKBA#6b3nA=Tulle$=c#oqL+6={?;^aKsi4R~NhY1s$-eyJ$EDSV@*eED zLIF*(x#zJO%l@|0t%$>JplC&DIecBTaybpK6>jM>aPG*It;Ay}A5~K9&Y&^KBTe4hYYUdBK)3A&evNI{if2L=?XRn=sW( z&TJ1-$l0s={88LJ12gCLh{@!z(G2SqAi||{{&mAK`8{O5N>bdM=eYFg3D7Cw3qnty 
z+wJXj?8jgpE+O%bPThg7JRbN_wgvGCDxz!%ZCkocsr%r2bQk7ue@A;@J6Y>DNhnqn zceq!_YDsCp*|bc)Icf1v+K zeF!oBp^d22CFv#NiNH|u^D|vyPUT zlYO=&>C7fM3?9RoALHkXH4;1HJ|dr+>c&pOM2-Q^Q3`x3k3#uTMCK4opI3y+Sn%mI zy^UuiIBLtGJvL{@?biy%{&x<0#>vH)2Fh!qD>|eHv})|zdc~aB6gkCS2RNsPgcr!csc>u&`yM^zfM@14%X8;+ z`L6x)sGOdDr*@p;b)ei%y_M|Z)`g&NZ>bm0UoE9UM3h_fkc476H`NW}FccNurDr}T z!o+i|Q&5%(ic*G(fjrZo;dN)A;QvMr2t+bKNH@?!o?s9#eh(v9uCUhI!kIK8y| z6Bj9=VZEC{3UZOjSfB*Drwi1o{eQ@!@JLk`4;_B z-z>|z2PG_rQcA-;jpP<|D7U03CFkQ9*a*Wv!Eza1HA3~HzI<`mvX z_9tIEhr$_|MLX|xqAe7PP zW!PzcD`oR}=C6 z2%Rr%HxA@zkErUALYYji*K1@QzLEqMM90BTB@hROFe9%q35(Pm$}{YxVl?9&XG0i3 z>yI}wz3c;NNo*o7qn+I-m%P@(XNH`9qt?AScP9^Ie!6WgcPv9{Hx43!iRqcNd(kb1 z7V%7=5tRNuq-vmup!ekLXs2J1K^2%4&k%=g6pqXHR-!P?5>@5fHfkN27-tAAWsOn| z?nWrYf~-x_F#W+?q;Dzi(stW)xB$jluaa4Mhbi=hUV zxNZN=)}O!TF$j@bFmCXc!V= zAEnU7{1$ia1F8JxJa-$=xZt{FJCzwylCj`Lk~?(ZTxo9)g>CBAb{6Z!K?h?`g+~XB zjJwP>aq=k5r?rM2+J}^LvKjV#Icq8Ac^LsgaeT7mZntj{S>t@$Gn8h%S_MVWDEnHx zigiNr8_yWW+Z}!4)n)#L5|STI8`2T=9=-iKQHi3$Nnt9Q>8|7O&qk}hr%X{#A}>Upk+cxoI*^hqX*^eIY~?T3tEuJ6FuII5NUx+A?yX?&31C zqIJi=>Vgf}&~nryl|hZG+uk~(xt=nv*hS)okUUmdaPinljV67f z5^HG^YAXkF6D&B2;T9JL1iVLs=?5v-Wz?a{Gug8GjNIM+9A5}PRA12!Vg|) z;9Wo~t~=AfNS5p)WmKB=2Vz3#%G(u8Wa& zd;M#PxLZoW?cqSdN(Yiqz@%8vo|T3bRc(M$3lu3MoSwuMkrf6dq0~eD?c%wl86&p{ z%4K*nUJvJFDciJ?Wh}Q{Xum;6Z@qVD+Wv{*V^ghm(86MK1P%_eYZo9Nf={$K<(+RifPRbW5-Ds*cP7cb5usJqUVK<^d(gZ{6TjY&hqzh(T^e|KLsfRmb8q7>rQ!E<;*mEOCW+hK~!@eERn6qW{E z20hBG+`#H8gF#<7yM-EdC{I%Zi^coCl=BMC#qpv&H0+yEW3d-nMaA!nEtrbPYiRbi zRe5D!tqfJgdX?8Fdw!cUQehQOBw2j;9rW;mp|{L0JAS%d3Ww8-TG8!B9MUonTI6qX z!m9QpMnpCyf}1218HGJ29pkT?eM-BHxNMp~%9jyx$bXjyaS99OLQc7+ka2KgGU_xm zdk@d)1U#^;CG$#rLBhKpSk)b$ylw|QD0c`uY%*fki#OLUNJ=+$?GILs#ukMga*Az` z3UnS{4uyCqL+CJH5M)esi*&h%)Z2|bgN@eZVzY(-s3_)1Z< zXOWfnW4Br;C7uCRHUu6rz!~ojV`s-x%Uc!hID}E-S>ZbP>7=B!XrY$_tb)Z@{2q|s zsrYB(iPaRCaF|8q@e$crk1S_TSzI*yNm;_ol2=xAt5jSRJj$;H%0IN=nrEn$W+|?f z^kUZR?RT8!>3J6k`sz~I1Cy3I@_ut$f?UHK@b|Kj=+^K%dPc&}hz(%!_fcU@D@t}A 
zqr(aKDL{65Cv4M>^->5cAJMTHS}qivaSL*Y_-4c;M4>L@42oYlJ%%0So)AzKl8}Z} zic8~|+k{WHY^gadLo`rjh5lm~upV26wRb>y8&}83W&C|$#Y+uV#{6a~j%4CnVmvfc z__?)JY7QtHmje*07Lw9KB@Hna2KqBX8)Ef~ z!5HyfvC07mg3-&`zpXp*RAIDK@AHFTRZ_6&ve;2OEEYf8Q@^=0+c4=}i$eU2K)AMO zBPr8%jT{E26h79YC`i_OG<-d9Y`Z>qrajp2i4a$LdqTHa`zC$-FCqRvY1 z?$6ZK4md@+L~S`KG?)zfA-**C8gOHpP4U0@8|bUYoQL!ZW0}HmoKCUxq&i~ti-1ld z&`VWtp4M!{%`DxW^0k?O^TijNpIs0!cJ217dU|#B`X(>^Gq$^z-v-TVMjfPDIQ8JU zDnR6ODu+?1DO!{wqGA4KgH8YdBN?A_0h4p;i~I7sgeP#Of^1zIHhr2? za^lrV>7DLWQsC}VS963CoJ3h);}Zya_?~pu)D5jZ5<9rbaqn@vV*GMn?l$!MLnx=vB8H& zhnjI&>#fF&Y4!mG+ctp0EzE4Z%&}!z#N`94%<_6Fw232ES5nSORt+dQJo4VPav+i3 zunHBaaSFL!^ee_3$&nd4g+Mra{8jcovI13+4F{%I4V1&k*r6y%h;u?F*q!gEQX~pQ zJAfQSF0yO!LjTd~UA1S=&KJ%Ao)Pg=-u~VT)RKc1BcV&31s&Sjr)jq`Xd(NFRc{wI z9dvJd*E-=-lrXRF`xy7|WNo9Y+bVs*xjtCFD2t{=wWk5kzG-Qx))hV!>YK)#P{^T+oEK$UGW#$y5?`-Lz_EZ^LyQaE9u*ZE0Ec<5*GD+T;ffhp<0G4?;r8P}P0cvfZ3jQCb_ug5Mz?!%-osy$J8r ztC*x<=UtsjiAzY}P&nt1;OjWtK1P5(&j#eA*HHF=~9j6!=|S zjiUH!3U!2bjifQ-hS{55=-ZX)=?wL~5|>9P+`rh1 zi!DBHw;`Pa?1(klcEF_~Ji4SiY`u|&6EKW5{60NS1Wvxar$0FD$!2QIpo0Az5 z1Rq&FjzQ$)=cD-sgRyjeGIE1&b5mj01nF-}u$Nc}>LblFh|9$`8<&~13Hdq48#-XI zrG@KDnwnS!78zB5qo}UeSt&EW2c2E&pnsyKN;6#``xGYmj%-Odwwh%LE%&24p+Ls@ z6=KyeNo9)ooya{+z+5-w-3ENYY)W|@2>3;}CaNE&@3akeLc#Ik zX-JJA6AoHSK3$ZAZ>23&c_xn;7nLn0xvj$)^^3n3d>)zuf^p%=!J3LfV1Lrop7NoV zC8N78_(gq&p_N>6kj;{b$Lh)er~H*73pY<&${o&yxGuY9S8HTKeQkvcb1HF}hy}sa zU#YcZgglW^jIVS3FDw+Oo2#CL!3XI%Bv`M=*{lHXsoTCv0Qbgsmu|aBPtrMv(YKe( zE9UBc2>8(mKV?fpF@gaG}Aa48j4z|>9DO{2Yl8&jm}R<4YlJq8k^gqhm7 z7X>C^HAj4In>7EdPv&zAnt;uCc;(dEMd|^v0jt1v-5Wv@*X`*2sW!v&g}Qgxn!KEH zWYoBj!NjI!g7EzgiWyzSP&gi8ZA_T9zFI4RR)3}a2+I^C9k@`|HN-h#nTA8;@J7Tw zaoqe38_ordXdTce^{tUq%tsyF`zSJ)EcfGfC)oT6aV;=`KVB`DOo^8}DM(!8<`pWFiR02BBl%;oMwRFW&22FL! zTCMJETy(A2^{SpoRV z|4jF<{hIM2H{1$79p`B<1-*mB|C)b~vRCKEO&i`7EW{+V7X~zMD=s2dg>bBC&v!{j zuXi=6IcR$k0$HzA+^0j}jD7tS_hs{dF?5FGwyC9c$c$iVzSt$97q^<66ZBjlgY~K*H^`h_4I3W!$f(aQB#zbZ z2K>!gWpvYI)3OXP*C!zmO4AklDspXI+NA0pN02Rke&@3KH^06%%F<+NC9Pi@aq@}? 
z->RD$ym#a@GK3r=_L~uNdNvgdtU^jN!Np}j{_-S!)Q25+CzZUYp{6C{E;~3h0D8SZ zDb0kTu`PnZct4`Al}Ua32oz5Ad|kzTrZ7rHOG%u-1X|X0hkX9%>^dfo4Fp(n`XCC7cH+y$VV0wxhexQm(4#kt(<5! z`w_@Fv|Rm~4A7fyb0>Rd&}d*kN9A@CM_19FR0JZ__~RbusX*87 zajl$5F5f#88v`S{kensv=6fX-mu5-5ZAt$4now5p!yR(IQub+^0bB2&Rf7Rv)V#d@ z=xQN&F~%De82jWFm%qf@K<`-AXcJo@-VwqeNEc14%)q)x1})%8CI9=@<(r}7DCQ(8 zmqGeaK2BWX9hOi1FceOgPC3k3j2-ttTI`M&tOhp9J2`1Lg0Udr&5~%l$tumD7{6?l z93v$N-pLLX{hhyJ7@Y+;MJ07MyjBQ;;S%5J#8@N_P8v9Un1vu>!QNggO8Cs<`@J2p zk>;vDQN-;BHnrJt9Zj%>;Kh;ro-$XR$UL^$k)G_sK!&%_S|ole261D&8}p($Ah)!R zt&>y&uMf`!w-@&YFpbrKNkchXpK1{e>#6qw)XW(|@k#K{=+3wBSp2Z}17sU1{S zpSi)cW((0Qd!}REo9MJiWwF; zbe8Bj9b}sXQn`tbTGP*|96BhNdj+j29Zs0u$}Jc^)TPNw&Rlr7*-p=w;zKbueb>}Q zPi9j^uy(=S7jZnVTmJXzn@xYV{y@>wvt8-0%fv1Oh)3}FqlJ`wkkO*{^Al;Y$|@zF z(55URxH<&oc+E5ntirQKqpw6*?krXOl8iMzb;$_#?K4b2tU^GfsvVkB&)!4I`UQis zB^0K*C0eFEj^)XfiYRvL-CSIqkVh3S;I;k7;B#TIKwRfn3x|pcX@Wwl4vAroXI<+O zk!ng-sM*iok@t-j5=Z=dgCgXT?k<@F@odc;a!ded7$85n;R7`g;GR#Wql5xhP2Xk@5-oJ(ZNTg5)14_;9%NXH4NvyEba$v<2WG3pZy zXOqVpDR#IZ{~_uQva5?iV@?>MO&nEjq7m*CpQ{c*4xUPyy<JB3!N>e$ zbFVQ4a|Dqj^tH>bg^0*Tlgus(|7?B@~OHs4fzyyV2d zo$8HClB5|wx<0cjv;FHccn&~usm8ehGr79~!76zZSr>Ybi*(@9it`~-IT2R~q5Dfn z3S#*4#k#GT<+AxY=T>xW9-}6bZ+kMGSgyhLqW##Qv8FJnGLeK~Z78d!)(t~+bwcmiqAy#8%lD#|6%%9e9)0SO+zq26xl=21?!Dz9s*rgTbQCRj%N=!YZHU6-QdVNQ zT%}N2H0u>dBOq81@Qr^3K=)MUR4Up%uaPQHy^I08mi9YfjDCu%+kLAnSiezgeT+*n z)_ZCl&EVqZ;+Va3xFMLR#M~5)s?-hYa+$YjhPe(IXXqIlTsy|H#8rCn1CZ{MW1pd{ zzwl(vc{SXzULh{4!!PB%T8&K0U((%pbZbkYn|If@d=J_9f_XgAPkbjSvXvhf~LOdJvd^5d^ zv<0)+f0yIvY*rOZk$_S1cG4HENjP}Ol45k{7=Ygo5))fz0FCf?9&;YQ0^82-M=s8)G=bRF`okT zaOm;EN~zv&4riUy`zCuTgd>IYcowWqvq)F!oYmp(r{s~7bT(T-9tH=zb^M`jSY^aK4wNxWA$aEkJO~ zHUe|pc@@G%uH~A&P{KKr+a=4D!K>fMPZSO6gFnptbVtEmAG>H19W91}2N_&F_TEdG z5Tg#Iv?81RvDu%t` z+=Q3SSkR)iV<1KbUlzJJJ_{N+(qibOjL9mRt4l)PnN{XCWk8qOeOj%jgR-tE(tv_X zBL@^>A_s5Yerm>sR^P^1sADZ-L)j!u!g`)QdfAVTuOgJU%F`MQw_7d&>uo`{u0i7T z#J@IZA*;0$(a*rS7-U9{x`GDjd3-P^8coFALRz^Snud_Nwaj_$q&F}`nIU(cK=ey= 
zv8YdKI-Ym85IWR@_!qZGMA%(t5$q6iu45UnzE?bf?OdjQJl{FQ9gy;U5zA)GMSw1q zMp|x7fw?(Psz~QF`OoEjR!xVBhAT&@lwu7+#gWp&&-V$vnm714|Ao@^pQ+ks)CBnW zsS-@-%?;ho7n-T*55h9<#2}6cc^>!rQ3?*lo8c%x+rv=F2IR-+ze5`ldyes)6zaL2Yx9mZ=G;_r!tMRAwub490@ z)w%tL-FcU{xZ5=QaOho)s`de;q^6oLR<38zMs<6Hmg}C5idB$ip=S8Sv#%L>p5hZu z(N?kiKf0DV36h7~?3ma}bnPo9;%-Uw3st7E+c+!=Qo1 zT8%a5d%4^eke4BAY*11FXh+YLbW>e_%~$_7o0oMI7^4u4+B}}ZGxflfJ937n$l7y? zya(UaxfE9@PqS*m<}Radhb9RX&uvsIbQVkl69G* z73*sk1N^rX)lj>3XmU)BZUVKvUh=6R1Sl;@SAKDRh#%LhcD9E%WJ2rtIbPR-bJ7pX z#L~jp%{Az`M+ni4q!!Ph4^@0K*H7aBp2a#V(BR(nwZV8D%g0o@n_Ewi`6i`3`JZuC z4DZu?N&4bqu66UvQne%DOcd*TW#LuB>DzXZz{i!A1UrC3GN?1n>WZxYo3j&!EPb1C9wvvF&X#Gkq539D%q zQ@Lhli@?HxPN2<`i!B}K#VPc}T$i7C#nN`_&|OE8t^@WYmWQ0X?X5UFT$kKYuOfuP z4p(*Xbc-+(Y9-nimxlPgm=otY=E&N^q|X_sb`L?qG|`%X+qP}nwr#un zYumPM+qP}nwrz9%&7GLV-OQ#Ua+iyW%s8js^V9+Ql*1NBa+a9%_`Z2X3n`{Lnrc8) z_o=uLl4%Mb-k>e;%4jMA4Kd%|z*;nibiqDH(n;mhS&97jSvEsBZn9BNtLOGaZfu_T zxK>3dFLo(mIasLQ5`|KaGA#Z&nu9-$>BJY2Q0&?F=brxf8uu&b=@%clY&bfR(kLxw zi3)BVccL(F7N!$E!|Xp8KJlBi^pN}|O+>+bZYHbeu8!P~ zMf;MRCV1UnfzO}9V9?yB*zYFCy7;zD;ZN@>AZs#x4^g5PB(ong5cX;STXZcjN>9WR z%i2;R(ZlO>=_rLWNmpKPN|El1$Ma>Fmg#;S->g(iYFbbhe#lCDAkUL6vATyb{}{+T zil!bt^p5E*3aP(mAj~U+66nmJiLC323xPL=#4(YH@6m7FnhHj?Q=p6jR9Y>8s;Zl_ z*cptgCwgPJo;ix4doMItXGNNLPC{PVob7dzCh{#jbBJ!2UV5#?>)pMK(>ZI9GNO9w zmzb0UsclE1(T3%v_Ue`s-H)Hb`6y;TA`F9NMMlavTp>&!OxDXRaXG z>%8mG5sdc{S6ZEFE&Clr?pl*yQ5QIpd7>h!Ol1r`36NEPNw&Z0S`-Tl*m_KVY1>Z< z)wczes=@+B`yrPJKF!d^bp~~y*=c3@rTGg$H^?`U+e`GItIPwZtAmbYt`!`6bDWFf zx1E`lFn!OF6w`>faeaa~{7rntFmh2@v55-U_cOq*|B(jw0?%6vmd_04fP~(_PAD0 zY?pfCjSb@PbkVQK)%#&GV>x8BK_Vy~$ z)5|x9#(hDWD+EX;FC9ZrYl$(WiMB7-5eHsr6C8uXV@GfIEWj`WG`*P>7LK zIR*KvY7o-ka#LM;&PjSb8@DZRX-b-k*ZZ71q>R{OpTB~73B`&9EZ3inKrg7Yf2&Vt zJT&yp`r-6E1w9;FggwlnqpyW&JFJH}`cuP%264ti=@0?iMjv3x$79kz)|~d9aVpo+ zJoaX%5528b6+}~6fO*R@oCaZp!-E|sv>WJX2Xi|YQ$aBRs27WNEy55>&{b+ zPa!9(P@nA0@rY!0%gD*nR1(0RXLtG$-h>5d-3zCMA^?ng(JC^{7sz6j$vNc-`XDaf z|R@Y^IX3=||I-IiZ{-6z!bFVK$0h^W8-Vt}Nu)VM@G(9kMa>4N-DqXmn 
zkXRf{+%E}GlNd=Af*9HB9L8^gics}x$1?rjgy<*m$t-&YBm67^xvR0E&Bnu(VmCE( zf|U6$V~RI%TVSjD`yZ}_B<3Ea-G`fZD@={V?Ma%cCmh?cPS%pV3d2v|Uhec&L(l#z z|Fw7L2Q98sLqZ1YkSlQo_M3Ip;Y~uIvLfqu1RucKlzoq3a47$b^v~pow^w3KF0ym% zQS{w;Gs#I-_A!G)T7mnqd@nHTAXP-IEeQVXRHpwWpOFsa+{61cO9t%!BF+e@Dc{rJ z>6`|7!c8-__bg{{TnU>BZlnY7uen2Rk46D;6`4S92_)|-2@4rTw1xgaN9}|Pp#U`J zsGrov?cC_v+g@Gzlp_n)aJP!quYGt<5%Yhl8)AyuKpsHl5x^kaI+*sv@G;=18w%p@ zO9viZ1K??%*Aa31<{R%#V;Y^>xpGp7x>k+eqP$MliCu*$NOtZaQg70DQ^9xeinr7+ z-I7v^OAf*>SutvGk);$AGCfNZT0wbRMa14-($z9Qeo3RFT+@Jg}k8oa?*V`AS68_!b9_(YtRub5~9#FF|=tgTdva9M42a$ zY(m(heRrgH=wFFZ>f~o1Zy%P1%T*WZD`94yC6q+o$7!k|IWbqo#7*)P6)XkGVh#ecJWb$(=-cH|NJWBmgb(56j6LJnS97WVBH{UT`Z4ob_opvtFoQHZBpo8Xt8h1et_Iu`}WnJLZ-|atIxqq zllEQc3KYMV44$ zFFvI?bwmMq$(7P9BNN2$3O_1qVCB33BxGKi1eQrLq{Id(pTVGh+0a7P&i+ft2VgG( zQa0WyHw)hCO%aqCGCjMujAl@Sv}Eai0Y583x8UBIqvWOvqYj)0|BC4N#OZWQD<$41 zk6=n~|M~PFug}0=xp}O_!_+64Z&vgAu3}`f-yIes5%0#}|Vvyx>+O@uw{_8K^sL zGveO3lw%kt?`r74>s58oLrI~13IB&Wo5)Tgli-OgTa;mi4W#$bqYVx-q(y3>dn@9fi`FI+cb+4B4REpD5g6lx_ZG6jaLMC8};@8w3;V= z%laNshTxmol2h(A@qwTucPsWFPp7-(jp>ucJgZ#!m1CBbh!2ML==f+0y%nQxS8*0V zybERfU|psws|)3%(K}Ic6nuE-r4KM;<*_Du^OUu~#Fo+gpw#K<7zR{VFM6xiR2}v*A&Y*_6?vUTb2YORa|x-UsKE97BH#- z+dC(R21B>qFtcJ%AeRJapRp+R?;kf<*YPiA4J!&WdSW~t^C!D3)mc<%6sDsnIO>}o z^c>^S)CC0*snzCr@yJxmF>g840EU*sh$x&t-r~7QndRWvtjai~= z>KjElxXHcRceue;bUuM&baJIIgjwZal|X5S>?n2pwct(H@J|XqdyC~DB#ZOh1uPssYW-1% zZ!I%=omi^GFUH1roM_q#>f1bW$GDT{IRHok`-nvMW`Sq@!BcQ1&O7t2L~9AYz_r*q zZ@|Jf8Y1vkYgA#V4NhoH0u5DMZzFvy3kE|Y8?ceSC02=_5hHUA&onDbW98wTXk zToXh(F>G@_lLLdd+Brhx6t|WrCmno|kzH9q8WBGtW6;0}M-#m%G;?!{(@kT9*ysVa z?J%=80n$7DI&mm+di58HHEuf{?TI*!844T_aFij6cE*Q4i1D|vSsJ!n3NoTkYu9@b z#RCc|5~lOGXPtZ@?*0J+aK6+XsdGCZB7^T=#Jpt_8T{6@F5Tu`oC+{ua{>o#zh9Mh zQHgoSzs$RRqrQ*m(F_SNqLx!Ye(v;7*&#FzmDCvp)=1`OCZXMvTfEUQHs z&I4iE-VoC@o=WY(aD7-tkaMTMx!a=;reF=6HHgy0`8STMk-7#>I&xP$$4!#s0f-^U z3#e$DDx2?=k!+rdXc1-sZ?&3#0=OHc1SG$Xw-|nUg)hh5*_9&K=Rcv7HsX)b1hXyQ z9=Dx~ANLXXs>}f+M@r3W!vkg!;Cn#DloAh5U9@eECq7sEFvlOM;ydiV?>)a1?&_JD 
zFK;~v?nCzeLRGc!M%pGTjGO~T` za3Ds!pyvr8B)pF$Yh-|d42^GBYr6 zMgiEp3Nif^dXkt%MA#|6tX{8<4!5>9NsWpPSn(!=E-fv&f|jc)XP{n5-gc;rPqWw5 zaU`Y$jUOV@;IBvADHJ<((-liy{(im)8=Sk~T2=7;gaVW*u~Wd0J>riN!cdj5cphQ| zNZMK6+$CCG#$P#2f~2_wu=??ex9IQRoF?3l|M%@n{7xLlzPTGT7KpZKsH6<_nv2?4 z>lIkQAp96N3Ea)M#+N75ZyHMEy3823HT2JULzO+|n9c%mQdm9^NsDgUX%c8Vr13bP zbC!n+V;v%KQQ(ZFGI+t<1^8c%lFPUE;Y|D^o`JA9nA=0526+A13W23$s6R0y-9!_X{l15s&o~>&PcWr5Q z0k0@kU+98hoM-d_C*c%$lbt038`*eAF??s+>gN^%jHcqW$$vDbC@xEaSwx+Itp}jt za*t>5SJ@5k>;*kNtuav4THU>St6Q7-nvvzYSOR;>p$B-V;shGv7wW_RRrWcskT#)I z4KEZm{Q?N>2KUf|y|&d3QznyTzf!E+j^{iC?sk4gk0Ia3dH@m{ z>3{}X8%V2huomsw1>v)+10W)jdNR$494@CF7pYPf1oM0?P^ zl@#aPI?3p|U%7~da}%5jkt6&pHd+m!D&w2R9CO*c^e05S&)e^K|b;}y}+x+4OPh*}Rpy?^c2`ZrWso>8|zuO^2#TE79 zOk5Dvp34`6zI6#Sp;Oi?U3&rbr}gLckqr!}jXt36U;#tj6abL^+`D+~M0-mTg5OOq zRlexyjqK`0hP*bst+>AC%n-tk!19chs-&v-F-U);@B?YVi z8jh~LlHK(s=>sYXjf6L+gGb%MZY>A!FoDuXn5N**-Qg3UA+goWmHb#fz}nf?E#ZS^ z;`dEi)(R9HPw_ZSd7bGFnrbfn8_s_?MecDbkHD*jgV{kq|7D}P^R^o^@RhndC!HsZ zI9i!5dYsW}Z+xMGUoSFnd78q2otYCbH?x}{t?QfAFdQ>%mp(7T`oXOjHgZFvAN7uO z{sF2t`HM7Ego_Bj$_cPnV>VkND2U?C)+=wvHvFeoImTcrqEs}gu~cUyD_g)&iZ0Ob z(K7HB%{Wvj*jM}t*Gw*2%78gQ`qJ6w>9F6!W4v6nTGSX|6up)xQfO_zKme5Gv4^Ll zH|ivjnCsRt=5_jflGf&zqmXH7{Ia^1fZt%x0w@a7;{b+!SzI>K3u3;znr2W4zXW2wSK7 zUZu94#%E)^inpkInO-Fy0BE{<;@SZydHLybie$=G7_C)0t6x`wn_$rqXTu;~w4M^k{X2t7N>M zmO(j!=J7bAE*4PvuTc3JghPasOJ+`))kQ)J6ti>>QbfMErBM%SsAGnjYZ;x6s)f4b1RK-%osbNlamk+vD2LxFob8keCx#DK%mbmoE-^TNI!$hW(i-;$r}!60Sf5#ZBxv%9C+tXlw5_U0+PX`kbFVA5?w;_DY z71?i;NV~HYJ~E-j6z0k$jl>G9S|w#Jc#yepcsNz%|AM2k{4Y2v6AK&5{}8DE;i#NU ztgQdD{C{y&W`_R{9M#iAC2Omd?N+J!KNQuCQfqsMRKT9R9o-EI#f?I(SsE_w2e6&SOESTx}2*cKy%vP7J+qqVR9647{DRH+40r2rJ)h*FLq#L z@R57b&JI$|pJ#r1c4BE}77W0-8DPALssSiDhlfxI8=xnNxF5zCwz&<2LlcNauoHk5 zmeK{*Ke{tAwGs@NnxsCotEnmJ>d!u5d44(35Ud+jeKi@#zc~YtNee5pZ;w?l7U5TI z8o)$0{MQXH{J_qQlv36~mRFS$u654}z#f1{pw^bHZ|N7ET?>M_AKwx|U0R(ytPcs` zyk@|_OpuLDUS3{}+Kru|Dya&!s_94no2ABP8z5J%))ioU3Dp?>vx?z17A@W}utv}i 
z{Cx-2zc7Y#d;#`(OLFtVt^~L^8G$?;=iAOGUdZ2Xtod&XXCDmcM;yy4e@nkNva&J~ z06ag27~>einI#M(Fu=f#-Ty+LX&in$@YXJnz`wYw>goQAZTmwTqKT$)2r&D8wv z3jX=~E*21XXUF?jG3!sKwIiu0D+YW0<}RGTza)nDoZYK!kGIL;Te_5_mWG_Rq+p^D z0+UA^jg(x%1xIs0Z@;hFZw%S5iUMNa>E~{s)yetM`T2(*C6z4}Ew#_e?%;O1^u+w=5H8W{i~ke>aS}fR6#~Wp++PQP zfZoZfbr13{Gi}c(cF#zH@Y^RRCmS$Tz>7V;FNPH2QQ*M^*wY`tK;13A?~Wh!FG7Li z9RQ8()L^_^xD|R*f8hj&HU|L6FN#p+x8yUAF#pf0ib>$LA|$D(hrJ)jIAXB$8YDCt zc<1IOB>;}#^G|i@j}3%teckJ??7}bO+~2pUMBUZV1z__pZFW?4a}?UR zz!I3P?YFDhZ|bC9VEj9ZTWceTz~)1`_{B?lL)aAE@pt{3uNQy@XS@5aBq+M1BZqi= z2DbmF4I4r!{Lcuc;@7ShfQ=Q`hVr9L=FiLew=v0)#jUa4q4~d2*?S=W+FAiS6xVT@ z{}uq=oCHBOfRCS68UWVSv2q6C(ES^@*YNeuj-nqs3VL_|S?_iWd*SH)WDnpEfa@fG z1ik=UTl^9L{bVn8!((s;Nxwnu0M?Fv2qdM4_#hD#&)|ZE<=@y3!iD9(g4qCUFZ>Wl z%13(=07#(k;DYHHKiCfT!hxBBV8D&!zczq~8GqOh0@pJ+*Mv16X~9`vZNmcx@-lF) z5SUSZ0&oD>=lBRhvybq{!NYCMA4&Uds}nmf_^$|`YJdLm-mCvT$kxAsh1AU6;2{dE z{@@`dPVT`S1>Ao%A>fBs5Sa}A=7qz#e>5qEH!k>>5h`c>mg5a(1;#P0;Tyk|d2M6= z5`KH~{%Av4Rxyt)kDlY09E2ac+PMX1RhYm2!UqOs{=}d19DMEK`c(f&&-_&`@L7-j zRVHrcqzI+hvFrTG2^D`io&-38YIFttQYV2mvDcR_@Zk_@Z~y!e0`dOapp?gN(GbF& zAIVX0@Y^r+?^&(tkB`%j3rW-8#^|3@$zKR-f32J=0WLQ5_*jNtItAcBEt?+Lb^KE@ zFi}|d?@QK#A4&d)6YBSnw$R|jR9Z+@6mI_@CBVkdc6^B-*VNe`e($N@UaeowN&M5P z{`=bs7(hU`z$wKOv)rjrL92!S;1qCiBpKDd1&Br*m44LuWM5d;T(&tenR`~BAS4nk zoGsjBA+4)YGAawyJHoX_p)_)&*`zo~svO4ziTp@gn5`YLz(6Gn6yOekFrgXoK zy;@$M-6gkqY%z~rflixjcF{;VLZ@l2YvT}1Iw(Ib!zA$pBpFe7tl-pgbxq=4!zND? 
zn5YsF0ZZz5h^jd+JrAlLbS<%dxcG3Tc!BYZSQd$@lnkW{=M=3DLCfvXP0~7=kCL%J z8~Yga&u>-fsDIFyU=g>@QnW?n9_I1QOGxSZN?5A*A^nkSdqSvwMb7hZO)c3325Jh=^sfU}|T-kyiqpG5!oqq>eAaNhdGOpCp~HHqtpa&NGq3Km3` zC`|cgS^{p0Iuw0$qrGhk2^)u{h~-097CRpj2HH(S?egHw8B(xO)vt`2&x-KN5^K?p zbDtXV{~H5xza82pK{^^s$%hbxK%;8bpr2H0kz9VKYu2{%Y}z?F{1|q+ol|{2FlsEO z-Xx`i=d~2s-6|_|Cu$~hGGc;Cow*kN99IejWyN(_rq{`@bc>9-qIhq42xNMm=58+^ zPvwW-gT?NrdZESKaN|fc?iQ=*qH;>x%*^0|GPqOe>zRlK=Aa_*UVI?f-hg;sLGfivwJ;^*uLAG>!E1#L-B}*yf6NdSMjC<##Lzo5Gz}^q-7KiT-H47OIYq zF8WfRdu39M<`7JH+Ra3JRJeV-|bhZWbVkn}lU2VZE z0Pa(iK6xVq8(zr=zcE%Ej!hZKz70N5X3k^{ zJT-Gx`Rt?uvNNNP?@~go?W&LJHX#yP7%Hd8X0VAz2`*Lbh<4x4nXn++A_t(wt#d4{ zEng>g9!0D9GVB>FgTBNE5obBePyqt1g%h);ZP>LH_-j!zpZ3da-rXfEd=bj^J(QTv zpL$ghd%;&B$QTu$ail=Vt3Lf=dPhRReB?*p^u*{1vh|Kl<*QzQRBu;` zQOa-xvHf7p4k3N7wfEM7(U9B!`kn{sy;)kG{$MkLIKmn+c6hgdO?g<2CiY3tPDr>O z_R~xMo@pIjw;s=@Iv+gss@|&1d9x^NX6R6mU*p>5qm(Rkyg|GJg&->!+l_qgu5NXC zR!0-h=ZP_31+ei)NdD{m1KZeEOL^1v)iRQ!&mDgd^qA&&)vqa&wfe0%G!%c#(kHQl z3_zz0eaXhzvhaoS?K4;E`_sE`hD9#<+6F}}{?|k@##tIU73tX-it4(M)mS7jVeTpH z3F!9CZH{r6wMF%xhB4!v+tX#s030~uyCxe^@M$G2Y~V{FIz@+4zr zaVwN3?CZ{f_MV$my+IKfJCzknXSk)$RkY@7HalarplX7~rIc7(U7_FQ_>W2GAB-Qw zcEF)zgPU*J)1 z!~CBP(YNEL7tOtp>(B@8oQX%kIv}xajfvN8z)S@?aN7|CXsxc{f;;MEm|Av)LoLOH zN&dd9TK(Ygnn%@kdLZ}{489D}`el?^&$2$(A698}DGtKKWeHO@vQ?Yo5R1F7%VukWSVta9T1;&d8EM`H*$8ZY1wb3}H_eA!U&i`C(af`~e>)Bq(r8hJs^tWhVdTwLttvuI?be3-? 
z_|&1kEE`Mv*z9i$V@&*!U9gcJfamz2)Mg{FU7G_kv}GD#%LC5+1oiRhbNQ!!_vk^7 znEfxFD;cbK1#pm8$e3P{@vg3N8AbjX%}8qbJX+>H32v2c_=$j+JlxxQNn21W*e6P< zCjRgz*(`SgQ6w-%SKJy2^3ax-7Qk_?9DG`2E4T-Zp<8#E@oi-?2n`B}Jq_vcxJRpEH^IMf+Z48_&r>S-w~13# zz=r_G;l}E|{hixe1Kz=&e>z14?+LzMbib6-|M8h$%n^`-q@$yn8Z|pj`&hEQWZ0hh ztPdMK0E3`twRQo|*3}#6t+9pP74cJdN)6CQ0iZtg3v1cuw+uKjm+%p1$t^qI@8;xq zjQD-cb262NJnK11$Z$x?y5yx8s&J$^BtUmwldD<0H#WJ<0D!cu)vw7GVF#OA9e1ah zXQy7To#eu90Y z=L88?G`llYUkyjv2G{2(^lc-B_ZYDeFovC0Z*X zppwzpB)IAZ3)4h2n->CyS(_qL8PH>0{*pIXPPT`v^=}sE^{XU-;zA|(4i@0LH8trO za^b3uaA445JdV*03G1{Cn3Zm#l@hdbA#K+=(CVWA;C)hnHe|7o67g)woHv6~Lu;Seex5Fk=irB>u^6#HM&=2*6X;tac0aR+EV;Ff^ z0uF14K_$0O;DfblkDI|j^43YGalhH)udRTS62<){_zNC<3)2E2A!8Onud&2bv>-{D zXS`qON~?Ka|5BCQC-*KDISZ?Ka(ul-;4t-?L<=;g4J_cvgi;@Ggx-o;^xBRGtqk&T z1k4cIW8&T`NM8%$$vj2=W3o)VJHN)tFmprt&AwgLU*Z|f4|I;L2P~FW2b9ECK6&)a zCh8i%DhlRBiv}#

fQ1-U>@cvw$D#CbO1FBJFT5TDe{F?|f%%4vMyUT%d^iOk>n8 zuv0!twN~fW5(3sDo)g|KunIDwci8^O2;CS&_~^K!<_M*Uuig!-n2#3abGt~yv*4Os zlA#gTN|4C89cUQf6dBY`%hs5Ejbp?If13!>y|j~irFj5e#LITw)EDZk_n<5%Ij@Xg z9*ErWV?c4E;!aZDVKhk4$g}2sq7AX}mf%_GcH}CiG}2|tC_8ow#7|1tJvR;luzweUItG+Ej{H00CA#ft#aV`rcV`QqS;X-SjS6Pl zk-E>eno+AVU!76*69R>My2zmt&;K%_{3Wxunb(_&%>m}fs07QWh*A-`%+@hUrIFMY2)vo{DJSsO@ z&LlC*dl`ffU4iz!TbO=R2(ILh-<1lojAKd)Fa%B72I+6ii8~SK_3m-kE)_X80 zP5K6RW^_x{O_rSZ*Q%>zvK3+}Q;?`hMBWO-8dlkzae^B|;xl9t0A;oM8C4h=dnq>v zVC%@X|3|3#ZF4-4cpOIB5a$!{1}{VE3U8%vu)q|O{gigaY^H(-zbM29oZ@KHhZjw6<-P?^LpAdUe{{ zsLJ`rnJ}AW)ciAM2%J$MP9V7472$t#)!#^(D||dZDL2-y)K|25R-TP<+&Ig|^uR{N zi5+#0uz2Sg5IQX*1!LR_7zkY@7Uf5Caq0*|X)44lf&ZX2=293r2&&|b{!u4p;Ln)H z@MbbFSA3-xx72meq{^(l;2zCts<&)--Wn~e^V4GCLAz9QXfjk4|rz>66&Eb~C^+e}$id|cG%`-k3d=w{HoVY_X5^g}M7%ywa97oKYcf zw!4sM5$F2MUL|YYi7%xUSC{>n`*vh_?Rn)jk$Cxu!*R@fg3lvC)p$L2EWS&qTYu7` zbEpr#0J=29=RehuRT23)L=#Q>Rv0E3L^gU`*)Fp+7^h5= zR|w}EU;p93t}t^iGq<^=+WkO1=&3}>*8Md?A$Bsf<_1TgzDH2S?rNf#7NfJzhJEPC zXcc-O$d?3@tAA~4l{8C~2|tBzkij%)t7oTx?xT)SP%Ix`gQM-5C+=uSl7oUieq;g_ zxAj}Hv+kVekmlmAb62YFZK;EXD#xS`h2B(fCnd}!JDmP+A)E@E)a8)AA6LOp{6%>X zC@gYvBG^j&op+ZOh?N`OEqPIMEnrBf9fxS^C9y{>xW92{ib_sr_QL+0&0U^>4ZwC8 z&Tbq-;jN>s5wfpiTOEe7$UX}Z>RIW@$u}1 zH~%(rt1Ki*2%as&zFdnOC$M|mDCXk) znaua#v7ZB@x1R=mt78~s5i9RA02rT4i`;rK%WZHkCM6|)sF<%Q**B>xTP$S5Vq&=r zx(s)oDvCTXB4I;bg{W~)71F~U1fRYL&}l=g>y(Y`qoAUxEC2&Bha#xv7t?yUI{)8= zEmRAuv)s+Ds(!N*ZX2RZZ1Z$LV*%{5<$u!b^?5K@TMY8oL#7Dor+SJ zd)$t=ZI_nrHnU9h|3tnRdMs5WxtB9O+R=1Jj$fV4IbFK^eaYu;kAo*3(C;zk?SfCt zk!V#n)3VuGA=r{kVX>fR9zYjr6chRK#BCd4KSt}?^>sFiDO|l)w)n)EteWpeUZbeQu<`XE(LW6Q z^zcoH+tXLkdhv~6vN}e96Aw%O+>;58^-5Uf*)T{bQ||C_1eM4po%jsPTPXd?(VS_O zegkTL4>TLjS@UL}TJnGgq?^F72z?)H_Gd~wYVczMT3xo>aF8G?dsb~W8U@k7UshiY zaLZ5R80a$d&CSAqmS%4rIs*iKZ;D?uTk~$m+T_m!QwOa%pkQOjMe8?;3K^o5uDm~klQisbNO^n)?hE`hZP1wy(gP3v`ooN(2J7vZV46*V|qxH#{TZ1L&uT6XR zy_Qt|JNrE;V=v*%YN3#rmxrnq(uB(aerM>rRB=|(b=x;(fwquCg-w|45>C85;Bq&X zD{=5NixUzh4J&Tb%@;-wbS|2?|NAKuRVGfVswCj++^i1~xlQdCc6dvDo5L!H2h`=g 
zr^5}ZJpE^`q4vghc8sXp&3e@|WTmbI95jrvt2>_-uF7#CzCX5V+D-p=keWP-HD2?c?E;M4pp-JmI?X z;3QOZaS|jYFX3Y@z23eV=mPQO>7vk-n4WN`{#sbXQC6ej-_7SL&&EYJ(C&6J+WPqBP z8ww7c$hny+1yr}57>f4w`MM#Idn4g;8Hd6ZQLhU}C-Dh;)X%~!f6u~I4c3`fTC>8h zuQQo9;QEW3-YJJn!^;zGHXyeqiJ4^$zA;n294>Rt3C9Fboc1TuZbsR$N@0!r=-H-QDoXbPL=GXxp z`oTNU3zvPYpF<)=mo#M6Vp(FV*_kr4yN{6=1;M*Pz%uXa_HVH96lb}sJ{GwbuHvbP)3pwPOF&MzCg$^meY)_MdVh z5w8A0Bac)vn9FUEDzqKT<+XMOE1+%Uk=J$&RIPOrhly&)TS?g zD${9p(j{vl&m+=%sFM1@OETp9C-8;sSrC({lK-pMi1`Y77J1sDmvA9?qCWg%c|(PW z`|l2>uWU#v7B)=2_RS<-!t!)aL2Yb^l6SBWQ(ix+6c`tUJ&StKG~QYn4%HOa=?ewx?Ib9vS&rDWQcw z&@UBx5oTM|RF5PgAG-x{J+IbgAnNmiNF~IZM%1aBGgx_W zT6_`v1Zx$t{tfMysux_`aW!;|SoiQN1GPgi2`Ty{a1RWr@3V9oc$s^HaU~T)A<0Z9 z3jXxKa-%Fh%u zwDF-Im5{!W?rnSArjV?2I>K|j-bpj~-?6V5h{Nmr&1$Et$@{xH=;&!F!jfxt54l(4 zY#5OaZ3v&&)oDgjq(9QTs_^nr5g;ZTYh>^S6wc@|3gsfa8ZT$Dh{C0EB5S|g=D>3kwmtX?DQCF^V2 zqS)(_+OUN>I5Fx$1z{`L*jx}4y_(}3W$rDfb|0sPkb;&EYAWxoeCTTDx2px51$SMlsLF@p$yXXXCa|ay~G8`0|YaG!oG^GQiVy6Y+X6aEf#iK!r zHBsrm;6pr;osjsjbM(tE$nC#i7c2yGjtP2G4B(G4D2p2G({ZI(X@hVhEv9^d*@gviw8UjECox8U*YK_ z&zia~%5S(G+b4z`W#?pGus{>dj8DcgsctU*AXJFS!r&OOkZ7Y)8FLk|Hq@=XJrxz| zXE7^>!-R<#eC-g&tLk~Win(fA)0%smkROGY0RF{%!G6SyFlH}2kU{f5#Eo17V-Anj z*OV0Z*dj5JcjjEt;wKsX7jkRmfSh*iAqX{*dxF|2bpRR|0eoGO;?t|mCs3!*O)=oY+k_dO!`UtzkBLfj2fXjEA zVBJt{2fJye9+Pq+LxQ*QJFN2QRXz1tU%X@9>x@5Jr)lt!`Qp6=8ofiQK!6X~!xII8 zQzj5G&aE=KO{;SJdnPh3T?9^Us;IYoN@%t-Zq9eLStf+t?CISSM89x{;P1Z77}iWl zjA6}cpI=$A+>uS7iTwlx>R8~}Gbda$3W#^~xk zb%|dD9R#$+rzjV@%>3hXB)vK}7(g^Ilxg^WB5_w2@>*>(uNNZSy38Y?g8C8&1maIj z;ad9njHA>7D`%-i!+6<*CI=GrnwOB>%}1QF26(EFQtS?qq*`Y&n8+VBsU|23i@&JP zi429_9YN8{+rl)#=@0&DHZl!|X2As6(c3!;eCpY_MBfgH1?l2qZWHCIQH+P)izmGE z`t=`}16yo9Nt1=NgiGMt#}lg7Ok0TicM>mtJoLbJlixU}?bM&|6)MKeKWmV+bB{sg zM7N@$i!c+FTnaC3rrx*JR_PioXVWwWmt|0t>%IhWBd9A+mC}v*n(hznO0TUyjnG3- zG`0(Pt%Zn{)dG%&NVC%>e-iRkJw$9Vi_Fpd+$4O}py$bueK#>1H1v=W^#I$*>JS6l z&H$I`$0I^+C%a{irs3yDKhV>-P;*sfln~G`-6ZwT;DM8A3;L%kVT~y^Sg!t)Oh@b{ z+`+@xFw)2%FG1`d;9!e{oTdi_pjQK(Vo$KHF;Gg-EKydkrO$pKA+wz;;*iUPPHEFg 
z?(zV0Nj|VmlT<5)=;8z&^wTriZU7iO7~P8W_{kHe2iQ-r3`=GSE$*GB|SR+<^_=>VXxu=})U3 zyUQRx)~t-Nz8(ZFOjnUVk-SvqWVc>z{a(BjRbSPvo5O!^V4e5AblsG*mMYS;F}&1R zq~nD%nM&_RHAV8KQ)|AhY4D5sn=^|_AHiR4T>N$zqu731EpvhlwYh%F^H9BI<0)6Y zpN#e>EPZ$mZMD)*Q(lY1(oWpjj>W3)+UXLNUFFAVnl}tZk+ZOC@9XvBV1Q9r2oUUo z&ig-R(jmlyw9r9(b!{qnjlNQ;G=(iPy;NVGm=!boCgW2A5mN}49>H%nr&^O54Z07< zWs@dw(kRlKK-i&zxR4XAuJ`tkWC8yNI6%k0PG0#Ix*vsy#mqgXJ9(zPBmZIs_AlZ1 zFW2S-u@kVa(^sgADt(Q6Xl$JWH^#$q2880fdZj}Fq|fV&I93JZe$(R#6GiVg?$LA{ zoB;=CEPUtInls#|Na2O3mgTARa{SD&o(Wb@M?H%Qok(dz3W?dlOJ7R2U|+xZfE&}& z%RU9fAx<4uWQ5E;r>}k#7%mm9?41yoSYZf|C`eLM_&W3CiN)waI!H|Iyxt=uIt>Fx z^>q7akK{CeH`Y~|K0&tq>2SauU$|#<>-(&Tr?}e>yzr<(L(E@ib7HV;AExqX{7jyo z&NopJo%otgNE2&?^8!5@l=o={DzV)O5#B192c+Gm8(34deNaUya3Op_*uy_01g+~B|se2(v6Q3Z;|-W z@s{|>L(D5{hha|A-6gu5WBVTG%iHEq3!|Dm<+@nmyl~>H`r`HT0OLGDB*Ki{h$gh zDY}N9te$X`ISR#}oZc6>Zc=NYvEI_&@6p)-ZtY)AnipR-!&?y2^=47Y98Egb_pS!| zdd`K{I}&KSX<|-h5Cn6&(Y2sWYlx*$>Sj!fW3&g7b8%OEMAe=!VrYrY!DF#UAB-bl zni!aA2qwBp02SH6W)#t`ee|_45+JpxsBWn3Fx)16#ng2agCSxwiGR{NaI0j~ZlJmd z>m5Raxbai+y_dlPhqjW78r-+c8BvyU)%UOBcn6S7T`!=ie&rGy$8jG@pD9PTnJDpk z2sBXGb_vg_^z-79!!Gd5#J%4a2?7Y3Reih&xn~)|!BrK0dE6d36VIpjY%*1?^_0Ri zU34FN5w>9(4Fx_BUaqv(zTog zCJ967=KdhsaPRn;@Af#QhVsIw>O*!gF3%Q1^I6!VkFJgr)hOZEt;meyr?PSUgf;5| zj+%ZeH>MuW8cif;5RxE>dPl>*V0(J2Id$5=DkN~dcK-M%4eM(E6W7wtFF=9QkzKDO z$X9pL4X2*5O8uouu(eX_Nd}3f1M28YZ`b9p6E)Yp{PsQSmsd&FM999 zc_V%WnHy>uSh#`cJg&o@S1wY=I6N8raGZqgNrjw%9r>x_b^Mr91eH=#yS=ibB?z4^ zMY+GSbJOfiwAncN=wV1efRj=eV)wq9qnA+N`>Vc2w_tt2a_>vJgG(hf7~|V7BeIMO zJ8%3^0AhZqAmJ$=LY>JBT&2BHalOR|~SOn?~w^Vi)Tvp~@a# zXhQ5$E`l3`yO_uw>T3!*6|e#nuG69-f(S8j^q|wJ<|SZ{pFfvHWg&*AbEqJt((>Y)H#_AA@5enS_rap1^xtS=meH&@qQ1vOtBKrU z&#px&G8!fK5L;W=JJ^9C;8E#ph?0}6U>4kQ=A!JWtA=yVqTh@Uk~OGwNl3d0GC`@6 z49Md9fg9W^v6{!-yN3!)C{ep>YG*9)crCEQKb8~qeH^8ln7=S2fIuLyDb#gUJdyIc zVYY7L6+cTYCn?hS`IT9>;lQDe0#@(2b!eV-g#8u*E%7Y>>g$=IbZ9?6thiXK8zP0C z0dIws_h_kbXhc@x+Ya7>?faKXF1_*Rx@e5X$B{$BMYf|qIcIpAjL7`)6B&ep%sie0zYEBDg+mH 
zW>|EF=j$ii({A(ba$~9SpR$tJv4t@j#ANA#_XAnU;m0--C6O`2g2kqp$^GNXA!iSL z-#bs&z1nIRXsM3gMn(2I-G>U1T~-W{DU#(jEn&^*WWK&5Dv-^A2JhmucwFY!+v^zpN-J9KsERVJ zb2SaatlLBct3An6>tK{7OYf|HvdfNq^^vLf&Ps{iX#!^TV=IRUs+as~4!z`KP$nE| zQp8z5Y4S~XJtc<;BD~%qwYHUGGMRG0iSXMKr6qcq@NoS^O$?LiBsMi>l#q;U#IdHL zK<9+)O1rG13qnkeUh6@m?jkZ%iN(=oEaB>47!TDHeUs_Y@bPtDLjpbZJ6f(W?RHk9FBGBKwDw;p?{1dCC>YU16{k))q1~V#Uu*H8Cp0Y2Fkp;f zs%W15@&zSie4*EOOyW(|Xj4z}tJELu{?tV;d>E%~(_#XE_}K+PdjdS(o1@U#nT3I5 zYt8)q#t0EK$?_nWL<5ow`XgVu28N_GiF=h@A&z-O10}wukd(QJ>Vg5{o|zR!sc&ko zFU+;^+(i7gst1R(WH$Ccab%6|gsaA;2paDti0yr|f(!LxQz)h1#E?p%_wSN?Tv4X< zS#`cN7n>9Jrz?8{1%BTvp=u&DJzYvH5B>5Q*C#CNT`_$~=BC_78AHPyspwwamWxAp zxWZc~CSsh8;#Bv%E9#W&cR;?Gb1{-{Uu9|8g zaocG*)@2GNKhjc@8njZMmb6Ld^>HdUw`~0UYp_2fTPCZ9l~C12L5Gm26f+*{hUI-A zJ^h%5QQ^dy`ti8}1r!wJ8^ol+-2`$}8x^vaMIRjds-h0JP1;Z3gG@gY_Dw)9u11!s zJV(tN|5d$%PK=TxXN6zd-QcbPJk>XKj+}ESfuur`Z+_H6{|ZQpDYddjvc2QE!Lfx4 zN|l7(*cbALW9a8!l=yXcsn56rU&MlxSY(4h6jGQ*W4Mr^&)$qH-(^eWo~z{!U%=~l z@C}#MQF6BzeWHd()(%;e4JW+7@NgzqZs5^p-PlfD>w{mr>*;6onrQkL+*iv_FC$qs zOH_J#Rj%F1c0a>T2UgzB_#*`Q+$!96Wq8wlDN)B8yrSunt^PL1@brn#L`dd{dpmZ7 zb&621*2~_VojgN)&5m<^G9z)Cr7kqo&f}9!rSLhqn$Rg4&l7%7_)6S7t23C#(cLSC zanN$y_Igh&8Vv(D>sm254Tj!nqOo7($AR3R zU%PtqmrvEtDKZ?XL-xO4u)RCd=a-4alpGTnEI&+0_^1kau6-)TIU}uQ|0&kDnQe{3 z=+{{}A`Dx3-);FtR=lk}-_OAQE2(u0k#~ud{?HsZ+Tz^R@#VlzieKDcm470Y?`S>a zyEt4OD+y`RmutP_?zJ&}D{e!wRA~yJf4F%Rvv$U!Kelsr9j#hF^6kI8w3dG|rlb_^ z3@v_weC-i)?}E143a5@g(+?m_SsNR}y|L9(66F*ifrCl^-Xh!qi(|?+Ni9oyyFKJ@ zA{v^MRqSG~z1aAF-Gv+{pbsNy8+dan2?Mg5|_>v_$(d_Y&89rHmED#{8d&|&~|E(kNXIuUJc<;3oX zwKke!7tK^CsaQZAZKbCGGFa53N8rOr$^kp^C016ssg)#$Q-_y30AT}y<`4duS+2FU z1imur)s>bHyjIU|Be5cYJ8AHN(0t}*S$p_r8h3rg=OGl2xD_}qMypQF+!F^Xe`UV)Mg)Usz0aZUXzrR`Wtab$|HYKNtmY)lJRktZE;mM@Ky5`ci{BbNI4-1jc0@$3+O#uexN z4w9eF-kyQ?>(XvvsbrWB*B$4WH#u|(kHTslM5S~I>?eua^zM8gH}v5Gq=uj4j)Egs zhuMup_4O&!eidt4nAC)a@~^oc&Q*)3POg7p*r;L1rd|4Jj>vc(THN!3t&b=(Agz&H zy3(|OR1eLV@xa!9{o=Fjp(zt5j9%3PWY0n#NON`Qdo69IT0>g>U@PUweG0`^PQMp7 
zjq;Qx=AJP92+rmuLy=+ZT9r>$|4b?&Vtm(gtmE6>unBwH=zs+rI#=q3`NF#>t2h)2 zU5$DITU6CTnC76=c}`4id)NJ(#29yYMt%?gNxf%&^lH3fy%x~ zVVuf|g8I?6`KOoI^w|3w#4e9N~IqMRCg z2r>`yz5AcgFj7CjIu*vBo_kO!4B+lf}@-H678Sc@a2(IkCFV<5{2?*B zduIzUwj72F6OIzzvUJs-i0SOS1;=fVg9@AQxS{kKGQ<|RDKw5BdzdI#=C$Lr%0pVE zSLgLf5wY%zKL4y6bv$?sL!1NefnyIwMa6mlndZo{E^d_&dBD;_n#C{2Z^`d2 znr&y)hM1riA?+ko&($Zy_w8i=E|5uUzg7>34-||)R_SHW9p7Au`Z3`)y{K`Tr7exn zSV~39cTJzgY!kwafVm`=inG>UBswx2Hk{QPm4To=aa3yb=pja>t(sWbzYS<^NEBNbF5Y| zqaytpv;^@CFYk0EF+PzEB$UdaD=F_dq{E z!V+4>!t7xj@Ag~2XiTD#>YefST1SGe)Iu8J`v}IvOu5UH4b`Q)TH^=RXI|v%nxe1% z3i5_td(YcfxGu0WY7HD6UfQ*u{dFpdFS_}n=J&)l0*dYjt+YWzF^@dJ|$)kkdCV|f;iZ!OZ_W*eJ!rB{( zbN^tyQt`B=$Dvp&olLr%4ORI}YfXXNeFc_u$`gmKf8%B5*UL8jmHm5dd@m=H7_}mYOj*eqC6Y064AZRAbVlI&Ah%kPN6N+Rh zl^4Ia)fpt#Hgie03ozqn5%qc0+X~$0Mfbx?%uyJN{qZ2!wX_%QKsklo9(ZMDXWw#| zPXCLgySz@tGs`JxZ$%XQ!!!S@T{pGPcp;_1`=b0x>%*bT?!37o<=1uDuPgod-uYO5 zPkeo0vbA`k5-%LF;`$sb!u1lQX3#aPv*K@V zS3Y{}8MNAJOz8tIfD38!yuye1%;(j%eDiL5M$-LMe&ojkXUK`#f!hzX!w(6G;9k=1 z@#S_hj{X<<@+#rI;BcCGoi1)cV;_V;ER7FKY)J+02G!O%H-!w}?IPE6KI+gbS?qj@+ZUq2WscamAsfY|x3sSn z`eKvcs>;XnUR5yHhMeBsuDfoVwY!Z?If7r#}_xjxFIo6UqL-DErPDkJvK{4Loxd)7p*9 zSiWrQQNks(?};T&)2ZwlS7GSk=mEn|0N?&l-PF!qVM9s?t)Rk>G2Mz^^fhb`y?~M# zde0xe9@@q^RXv#W1yCNse`hxO+Ge&ziVO)o*k>!|X201E=aKr@1=*t1^Lw~ke+5|% zQ%q%AgsXMI_cBh$=YT+|O_R;M(?G~%KY)`hht#YOoB|$q{H+z28S%4H(_PWfu31+XHcqui* z8Cx?NO%zacc0pFzd1vyA;ox&z=A;AM%Bu*vCqF3i?guA{2Ob)SL!O4az>K6z{n`K@FyM|-g0Tx*ti(+GYYl1dK@A1v*pPL4 z-MaP{@bi=}H9K25!SpYeA~RtP<|Q9trY+sbQg92Us_4@bLMY4qDbk? zDolUID)f4xUhf%t(YEWl+)D??1gA^d&4zJg7fR!%8TQTtxA@-wy5~O=RqV;e$6nLn zAiV;6XM^SL6Lh4~v3#qY>WW+_t$bFM@Jp@pU>XvL(k|w!b0d7j9C{5gW#A9Zg7g}! 
zt~lusTh^#HKj_U|OFq>uSnMsK+!4xZ<}H1;pm~RX^GS_@Kru#k%g?{wUmdpHAnUZ= zn0nbfN50&`JDq@V;k`K=(Km(DPpIZubg(bSnl`jcB$Db%XZXVI?`60;U?NzK64M4~ z_gJV7G`A~t-;{eFW2lLv6Zx4Fe4@)TUt^d>M!ibhPjl{t3)&?c7A5CAM(tDmHN@Rx zYiycV>Scim$HATflh@P}MQ}jEclMM3&Xf`=JqeK7v!h?S5nlI##ab7)!zqq90{Kv} zZ@9%6XnJd<`i-nUe0rVP!6K>Gk(P>eldJ2>H`0^Zar0wlSZxU1a|lsh9U8{B{0 zU=wAMqFHlo?ucHITu{$}P~+#t$;Oz2m-X9V2$ez($V0EEwC?QoX3|j&V+Eo+-`?0^ zIQE6AmRl^4Jsol#9bkEV=T0~!_8VTj8#CWob$jwd*z4aO7aFg9<4-?4z{4m{?Xp~c z`_I3n^qhDXTd=iax8I=#GSFRjgwjoVcsoQ#qy3m?8~U?Tj%#jha>{$`?B(B&HQC7j zT4K{2G3?M8UBR#%3p337tM<|AJbVJULo#lcq}ug^p7pO<09l)Xf~Xv zrw&_kp>KdO9i24l=lRfvxL0fAX_3u>@6@iqR6IBJ#t^Nt$gG`}jj@?jKaFLXSyirJ zFUE<|8gPug2Q<)Av(`w|JW?A@Aj=h8BC>H*40^yb`({IT{8nEf3OIgxfkW24zz>J%y)3j#DH3uOW3XF}O@ zlogVi?LmSVgmB6)2vg#@o#uH;7@)|8Dnx$<%4#CK1!``LvR*>aj;Qs`SytNL%TMu< zlXM2Qf`0v5%BH|j`y&6NUb^RKtvknU<>>^rfoX~euJ=e=Fc5PF+!zyGUF2M!Yr@o) zWl8Ak6@tt)(Ym<+bSV>O(qIi6?5r|qwe~~Y#6>WV8p1r7g+OvnbO~JR5KV_!YVDM? z%(y2B3B;WD6#a=cozxBogj%CztNFaFEbJ(7-h8&dv1F(T;)FdM$VwjkU{2cD*5Yqt zpxoh*)E0UNl!i~=ez<-H?H|m-NSy96d*_Vzf`EDc0c-QIc&+qt(!=e;F4GeccB9`Y zoCG#nb!71549Qzb;;Wn{q|?p{i!j+M%ti|848ln5Kov1E&?G@}_aedaT+lvTVAu{# zE#kqYt#}LIpsjCH>fd4D9bv=%*ub$HAQXqx_)muK@@N_-B>toxqqYHpJ%ZtKqUsO_b0pO%T*Qk;!4;Y&us9LLoOH_J2CmD~v;BEc#>_=NPE4xHD` z(uk{Ti>-_4g+;2Ad?G%!ctTC~lJO$nLKDnp#9|5x<-@!Q?acLJ_D!~0ZbTgmKjTAek6vp|ahc6G^ee%J2gF?-Wq}=LnLE5u@53kHqCTeZt^I{*UPqA{BoGlM^Mb zl&|#%_L2@cyc06b^yIf+19D%0?YX-Acuu zJk6LwWE4Ub<&EwNzz=gkko&s$e8|s5`QRwd51#*W2-x(?zVotuEHBU8&#X;AaWwIU z5l{Z-HesUeRPX1cM9Z=CkNa$t?ie%&xE;kFMi?Qer!JNGnsc9bfG-KOh*z)|F68177>^n&NUg{eFQ(6+75Rje%m*h zHyOy^Vy`D%a0CaAe%rvHYz&~d;$kSZo9V@7))DHzJx$cVFCQoX*%RKD08rmB!+snZ zH}j>H5 z`1O;Q7D*wjyW#zU(m1w4pQ?g=n`2T<%z}apEH+6DPDU8~k z*hMllQ97l3gkZL>qSmKQfwo=M_Z4g)^FC(7SF7AS&$~w`C_ozxV<&$nlQ_h9(!|n2 z(Uua>9BnmS>xO)3C(UTTjN{(Zl*|#66k#U4z8x2u&s;<;ld8i>%%#~$m~xAAe9gRM zpcy0%?Rjqi73r0MHvzH@9pNHRo|OBxZ;6uc5g0QD9T}u7PU47PPoxJN(47CqT^h@Q6BjAKV8=_exnSYORW!BU8iMp zB4+7FP%iX;W++C?M8{M~S86R6$ADf%$LPb$hjcPGam2$u10V#~dv^A{rd_NnwCj=I 
zWABu^f}5~5Vv$v6a-J|C;8%EHx=iBci0iyT3{{MJ z6gn@!&TYpD?ekrzk>*jo zp_(hc33^KZfKu+1gf<$yVt+dpTCMv)9rUtt$e#~<$}NKVUxZD4ALNRHUrAPy{pMovGgcAYi2B=f}w4asra9;x!-r~sh%AbeIAEnUca4qSIv-WeS?kvX=`kdS z=q^_H8Vd#E|&HndztxDf(@B* z*0Q)KzLLw|HLRO%JVd8{x?F+V8!oBZ>D5$KUr6de5&jw7f$JzY8#!899 zjOqhf&C6P;03MK#x_s$W=A+v}4Lk;c@37_Kw1ZFS2_;?*I;$t)WsLQ+mk7r!UbN_@ zJj<87^t5{_Z@!&4b2H$B^2->XC|TtGD#mON7LMO?|IW;*W0=lnAMAF8b+q$1pTH;Y z7eWZ7w^D@cHNC*nSDZZw>2Dukwi+nok0J|M(($bL2*z2njam(WhT%;{yXop9C>I-B zz^N|@YN_n7XTWgmiC7t#QAM(t6VcHRKyT3LQaD4l|6^WDpKWw}z4E)!`BD%p6~)H6 z1}hVTa#ik12j)Kg>zIe;(%9D5Leadlm{>VnV%E zQJ>;E6tqf~D=7n@KtLGJtYLpz%S@L0dR$GH`Jg~Y%!Dp0h&Mz4^t?M?e@JLAct@;$pr^N=({s?<2-RZSGvGS(lfMkwC;0*MO|kMoaTxMMb)ug^ z7>qgUs_6px)R3*B_wd$UC|wTr^9E5*Z?0ZIU)+6BWVrT8l;chm8rfiXsm@#YW@rxh zf1Ms(^efnit%A#z_lR(rh`%dm4qmg)AlA#*Gsx{!B_q#`E5mw>G^dtytk{#zc5l{;gJF z*IW$Y6|ldqQ!{}w);dBSx@+HiL9ZE^fLhQCIvOr+Rx|FqAz&O2rs=5ETrzR9YFX-D z#J%xOYqV1*Y5ZCmrC<42L-P5&_Z+gWC1SrtFR%zhk1M`)kFMqw(^t9>tg8#R zp3yEi5=tC--n4C-sHwVgYfaYIdhS=n@Q+#7P&7N3^y#cioiARif$~OGQU>i??^+A* zOwYk5n)6pjz5O#o{|PKl&!65dTO=%mY<-&!ytt(UX}y@+sG7MruW;xg;_Zyx5@@j~ zD`;Vc-Jgx3f$dIok^}33>^u)CMSaB)(`sw(3H}fJzdXkTyK@G){V{7)2^yo|Rx%QV zZd=~D%E$_uyPu$?VScV$!xKh)?U=dPLT3&xeVJ=^VS}(vI*#QHzQw8CtAU&V(c%b% z|IMTX%vP^WX!LSPo*pHj9;JM=+u2>q@3zCiP?~MUa%;Q-n*KF%K&jl5>l_KS5oKjR z(y4D=W*gR!e+7h&Yf0-|ie704NISQQ(X8dn9=K^7%AQK)zIwBDlg-nfSTzHZtl_Mu zt^7PBP4Q8?17%7Umr1PbpmI?}enpNl?zj*kgg0JBWp{i~M}XUo;?@L2oL&8vH( zsVFhrNsZlFs6ne}!2m4Kh+#EALYMR<ja(CSl@CApc<>1^liHMN=L#`kR~n^;=1VU5dTLgQvC**JYF@C_w2w<;;f#1 z|6aq^P;cw5_tFR&`)hdjhS{~IDs$^+mI#RGHU=B|h~phR{5Yt$`J_$v+np0T~tO-^SJk^erWf3h-`-Ku34 z6@Qjr8*_!*$Z^yqwz%ch3j!5`YO=-c4=Uat`)e{ZcDOJ-OiU01NqzM1w9jXU zKFGaMDj#|}Ee6(qfj{Hm#Du zmNB9dO1g^E9omK&>nXV)^(E-kuECKK{cd)RO8`Hylgll z<1l2f4>@3_>5ohNY3Mnn=Q4@>WBBfH77^#6Q?Spa399}v9~~lvfXG5aj{@1s0Xcm& zm4%~+sxjB^cVm}6oF-|p8aBTsFJ0|sW7f#wWH@$EfFogUP^PMG`WcnWd)>RLOaz97 z)X@I&(9xXc4?$qD@r5g~EThJzlOGL0Zbp35SlHs`uMzfT1b^h=_R%b)zxZFwqyE_56+CBh-T&H7^`g;R}UQ9led 
zk6rsv1m_A)FZSKaWcIJ+>ak8+kEWYGfF7v1T=$s&a=*QUn8-R`f|9u{o$Q&}_=6+O zGm;bB$B2|)uX55y9a~!v(S>rNl|x+&deGZ&AS34{v{@=kpEx|fp~tu=IX*M6X@oM# zluOLu$7Enw05A?5JRUB=R}^v@-SOK$5K=r^7^KEu@%B`~1 z<=4*a)KLahBr7nhXU1jEduF%IYL%Uf9F6A)Mh4>zog6azgh(q%+$Pd%^^(W5?CnLxHFl6I< zpc5AV695$5oNo%5kovQj2?HK)OHa-UmL4ele6c`0N_6gKrYdV!Qi=0udXa5xul$PN zuG7E)-H$))OIy5p3^dUz&KzcXuN+V96C|56rJjJ^)9it*I`wwU+0k2Hlwd!5S_d=Q z^@;hhgwI>>*SXZ#)a`?2un*@`VR}|U$Y33wjepQ1gRDsw$|jzzfh;w;^k*vXmQh6Qx68)>`k1qWg4~T% zh=<^mPvj*Zbd=P^ldT^scXEjT@7~GT&=kc3*7JyXBSjVz%0g4R(n%KCdhC)Ek zf<_yP$~q!7EMPOOR$7CNkUNBA)dns~fi|%NscWXGQ{un`MkZDuH6?+r~5sR1n zBG~Djuy`^@vbXLT@PP15T4o|uWY_s9)LyWE+|d5V?jdYx75WG`vm`%5Y=n>HpSv(l z75v;lvvz{N4w-e7wfU!j0Ri=%hxrf#%;5 z^`0<+g^tD$IWSGD_ps$d9A1OI$H?CFc7+)M%mEd~1bI28GXq!!F0El-!qqll_EnVa zVmWKPZR6*^cQ!HHAkY^RL%1z-xO|$Xh*Fs<3<`aT1srMs;sokH`CAA70sSW_r5Dd; z0nT`LRIcV!+=m{1IXy7Ve8RjAV0Ywu9#gxD7`T?yvar$6*=a`PIzp$`im)=)V^F0! zC*ry^5>3uqsPu|I+ym1nZt&JosD-VCFayJU)|S9TfP1UyBXB7?}^#u_)*R+H=Jd^HUt^wp`X17ThBVdAF3Ye z;l){?#F59;2p;nnc}?6Ogu>;ZAB*BL9qRRN zA-(aD0h|S+IXKPfp)1aF3q&e3-lPX!Iv6ENb?Mof=HN#_+6a> zy;LvQqriwnvdVX?hULiSS4C!Ju@zPEf-A=*jJ#>Bta}|cLYac87;L4mAwC|}fFD84 z`N^d4C0GjF>La3I#*xET<+Ei}$7~VvU%DQof=8GM2JO`7&h#N@!F#bG6~o# zsF$bs;O+yYUC=@z0qy#2g4lw_gYeW>;GnqqO>PSOIEjT^h8BNgwVeuQ$vXPhS> zmhSs>GF*mbqGf@DoMzR*E$h7UBLK-_g$EJADrfwg&C>{Bh8Q|Knf`&9Vm0zjEYo+c zjvP)-2HJ&Ii>@s?M*R8<^b%q;<19lq#gRwbEaPOsC;}}=9wee%S#96eI+bs#uTK7Y zy-4lT^?@u$F!NMWm-Km`eK+ym_lO1?CQC%sp$LWBDB#V;r+^(d(By8eLZ*!7>E4&8 zI9YbwB`kE)pUb=!z>w%bqysQx15NGpitdw`wo(dY9p;$8b;4nJ+Jg6Vh_R0)C#DxxXx z?ap*JXpx+s(Z|{LdbH}~Y>6)UUauzH-It}vIE-(kSJk*qtbLRxdA=^X#56=U84Z8D zX#z`uiFT>RDp?h{#@^QYi7MH;vOF!O#g4L5XZMVh3-1ZHS8(bJStr{%s{ChUHr00= z^d4Qf7YNo{nD)A^5nQFP{TgOrj@HCB(qdP-3|!K`>{rUR9?+3Wx*fFQG~ddM`r1%= zmE&*O4VY^uTOn1No!79{u8pE%y+(e3dx9r7jZ_{))kva3%5{hDeWpsTN+@*&k)K3U zLucE;paw2$rys60?=^r4o2@~%b6o?gO*iZtIJ^e1Uiw$Zx9otb$(w0z(5WjF>3vj~ zi=iV%iFu**W?&4_p7NI?w*ra#@+Lu{sXwU^Qy(>F#*!AL{$90;xZgq>^aZSg!JS%G 
zX^wy;fH-R}sCIa9-n+rJh%$|nC?AmMmd)xxB;3WjfPz~5qCO0H!w04;IxeFDEEkI) z4lc>(FlE9%5M_VDEI6Op44k+O5@rSf4vc-?Xf`DG=p!u}`Bg2ZzwB*fWq1^$ZG!5F zqc1s<&W|A+&}Wl0LaUTJI#%{-t@@K@o_u*{8MCkTb5{w0T1}a_zgJ;gOpUWrIiWER|+mVh8G( z?q7@cP=FWU=6MSKyqNn~OfQ*n`MnNaByMtih5xIg1Ds{00OX#s*tud8Pc@q^*4X0u z`8eCs_unEW&}p^4vFGYWK0;eGBde@Tz-Q_#kb2GEyZFZ0&(M21}w~9+$+j zfG8bhSSX6 z9r=LcIfWzX++b`pd_)7b_I8>zCkR{)rFLl`^W^RXL){Uj_5|jtsW<7MIBf)aM@$~@ z-^9Az-y%@3>t;TAG2C4GVM}+9W z`o_R!D_p1LLlo=X&6E$J&$V=^@Y+2ttl%b$`;F+;2Uw=%BA`X&>euBh`gkx{;zz&u z%O$fK!Uib237$!c&Y21Bc}X0?48*pOEl7ucFfjMmiJ+2Re;#O2wQD%(u9ax^Qs$$~U+ZGpyH61sj7!7 z3jyA(>In8rOY0ixM4buW7zd0SXe`{wWtb~$G$jn0x_@SkJ|~`3Y(EBVXzW>IM|H5}A410)w%?1>cwvzZlch;YN5*|2qqb$C65`f>JT-6Vj;SQiQ%`Rb zm4S2;Dj73-M2iKy#C>sm)5#_8kE;sQ$@NsYYy2)esemwEW^sa<%KbfBEvvJSav3bjgbCKY1c|?xjuBl1o~BYNhBRk0rIvbtOaa74 z@;^#9zsh-hFE8I)Bmo^pH#6Z7-UqjX{=YgMS9&f5LW27pASg#N0G!}&oY01^kh4K@OjFh0Y=6F0(AwT01 z>h)DvEJwHZB5s<5sEg({DvRcIQV7GD)yVEte{>Wdwvb_3wmvyRg+`;8s2pDk{tbRa zcz^42(~^8$c*y)xIkJRE))fa-RM2>MFd8ics3uy@BKoaK`mK5VU6Vq+*Vxq->wZwNF27k*|_fTa)1!s)WW;a~eZkvh}J<|v;k z-CDojQxcgQSP|7^u>0AAmDl@3T(g#PZb7gj0DkSF_tC|cBpQ6-O`wWH^k3ADnB=se z6QLbQb4Y2_O?O;@!V=*zzRCKZvIGXEHec*+6$qLrywBKM6~fN;c;vmDeil(I5FAxs zW`wgUO(IfbDM3au8U8^ITTd>|I|%Sgw*JdG0?21AId+3vNqS>0;v@?n1v^sL4c=XX z9v-he0!)N=fUYeEKFp7#K&&-bp-12kSI7MS892!G#*hMvy;N-@=QP&3 zHqfQNk|oN(YViOjq9_!QUn^E9)DAmqnu)kDP+*#|2s{agb)c)dB?PMsyyz#)rC5iD zRKu_zwA#$Nf!$QMa4V8pzJyp`a#&Igs4=R~aaX`{ABU58tVyY33u%t2^$Y9LwX*~{ zDy#oTy%W{4ilqt(K-)j09h}Jw!rF)W$BMV`TyFF0h}Vc+>sGtRdPsfi9>30E#Ssr` zx~}shG*Uo_X2Fg$rjY9a5HLvv#kc21qKo1M>EGVc zSMNta+Pjzv{X1N$AZ5u?<2^up*q?K+{V)>G4#Us4G%S4SQ9*_$cd08W)STRS7!mL! zI(H&sdZF#bVs*%f`r-U|zpeDqufTjL1PLkC5d|^#Oft=c=*NHSsRN~PNa+V1!-ZAx z_F?1R+LszATCUokUg+90a%e4up>@ftkFe$IIGnc%b-sq}ei8hxh`7W}Q|X@p!|?p| z8#=?$9TBBM;#)gLF&9`OGPhQB#7;9E6ZE5^d(GU4)*8o}3lQq#0yN&xdfiLy!uTBFho%^W#Pbz{4K#K#1E!w9ppdsTuHfFU{`W*;3 z6a_?lnlnLNla+eE?z)rd)HytKe!ysj(?yj=%AfxGWS=B`c#id?4sqn|ahe&><2I;U zbK#snF1=Fqi{IqWOCst=;IjQ6PcV(zH=!-;@-imeBo+JeY^l5~6(4yi)7Y9TMF^Y? 
zfjiYgtlWAdEIDqbU?HEvu^)0{rM|th8ifSZ!R|T%FAar@bO3|BzSCs%i4R0Og(Kq^ zrAxO=tGv&QsJLWO5~;xR{yr$vls-PcOM#3m3;+J98uOTB*6(1+y`gy zK?ipT9$bTKa1HM61b24=!QEX3f=h4@F2U_hcHh3*s@Q-#&A?Z`bMi>U=$Q z`<~eP3#4&l<5_D&VmF$~*Q>MJMFNa>;3D>x>YQjd`8A&;EwtBEPJ5Bw0-YDjYU%+? z#q4z5C?|zX*_J|_0s_BTA)($M^pkXYt_4N;(^!MtxFn_Md}Ud~d7G-KAWOr@;rzxC zC3Eya`Fp%&$X$B+rN-BSK+Gb>3hVqe7GNgd4c2eWtIe=id1_)|fw{V%*H# znf%U<41uRqO$qhoI57kEwUj7SQ;s*YAmPx>+7pyVP=9+gxUkoduPlWaGfN|q?kyx!btzel`S9fW zkBer>a~l~6{$QL6kh0juO7L543b?pNF_Z?!uISoaY6yh{PoJLa=vG)bprF2Iv;O5# z=f@w94i2xcRc2*-7raZ%!%Eq1o*VCa?)$sAYO2*N$g};J(%n~o58XueO$v;f);qAE z;Io)|yxY|d5>D4>=`#8v*miH%nL|mz7}rX+u}Ix^u{W_=lZlj6O>D!GfJP|Bx0BI1B+XjC=)aG%3cnS`_V-~t~Z6hrH|&p7t0H2JSxMrz;HuJ8obC` zEyMRZ?P}axST})$9`SFvKCHt9nj*CIiS6gw54uedW}x9q5g8N@0#i+8yYkLqvRZIs zE6Eu`EmV0Yn4*{>L*FLlAt~=3<1Yz26$?r0^Az!5AAAuGlD&VaS*O|`Q1HXVPiwzh zI^uGT7Ft1aeqL|+%@+2xf~VkOxcR(bd3~mjgQ`cvf5TIbi5*IAti$>P9M_{#R9V)|GCet~4zy-f99*RmfukpxADgWk&?Pd=GkZwGgeL7A>9&p)WQn-kym$K)dg>X}!>#(88|Y*-k1DYMvt zO8g|oc#L?Wnc8%KPwYpUwlh+5zZEl9DHW9hiw%mHS3esEytPUi`?z#>#3%>&7aRSg=b#E3k;m_s@+jT9DfK20dSPeY9P5oTa=P%&l8+qvyDnruV0T!=3u50);4*29lxznP)_wb3wTNg zXe!oc{7L|_qawEE+(pbYGtS;BpgRO8Y!v4cRz04QY&36k?U~Di;ih}PN4`?OBlwv8 zOBSi)RMSHM(^LMTa+bpW9i+aKveW!=)S)lgw(|zoj*7$J+;4j4lFrU0!)d4;V-B+G zH$J+}D)Y-o@(eCI?Ja?<%ymHel1}cADHpVfVcJ1HYQ|Wx122Jg#9|$XIg&p4k=^?= z5r%u^)SR-y3%{3}i(Cq^NTPu~$<%_>KV|PPZw=vG!$Ua6sCTV6Ui& zfxWJCxT_o6i%D{E^l8C1*BmR>r*huQ`9^CyIA2}jXNsBJX)AL~)0&){Nmd6SoVB4+ zxZAGym-avstExnyLzUCa!;n`>!9v5EC>wR56&y`Kl8l)~WW$2+8_L2C3 z4L4a8T)`R37{~1y|05>ke&2>L*I=|JDm5DT`q@}VBj1>}PMa}H0IR6i{u%rC5Ykks zJs`!QWlXHy0$H2}dt5-zr3{0!YxTIQUW|!yoh0crSTVU`uU1^>NF(~B9j2PD_5p2w znJU=X?|JA7K~D@jkRZRsbM^;AfBQ#T0 zZ}2Z#BFCFmu8}0pf1>%~kFQyCJio5;A0e!~OP5w962m!ZosWV^&;6Cben!{V`-{H= zbdtijKSOvA9JWZsi(Od~gEV;#s`>RJSXhaxb%Jm2xgv<<=>}%WrG^IPcBrd=Zk!`5 z4Uepk;!<>uw0&e_EMr8meBboxTM;Tz1Sr{Vc&>(T9cZbP#}Ns;4jAu0$wm7pRr!Ye zx6}z|VUZ%YKB;m!5d-iY`2jo(u_)RbyGe^vaXT81_-ONV=S`bD?ZAu-!_|lG)DVFs 
zd}>A~6x}%9SNyd(d+|z=FeK-8CEY{!y*U&5Q74_3iS?v!N>fz6(xpgp^UYJaC!yvW z#E!Xe`8x@7fh6dUg~zdYCr`?!*~J*%{FciIGJ-}P6RDHR5WmuxKfMaNcMLm9^gVp%E`CvDAf>*cZ_iA26zd|~? z6FiabSC_hR#as6QLB4QlOHl3E0FwR5;=E%eo8kegVfxWh8U^gm>c#nc@t69&DKVNa zgdfc6O_sRMfli8HJ{3Nv_7xiGiJ`z>+}gsujt zQ&(=+1@F7qAgZ7Fz`#qcY2l+G^hp$6m;0)fsE4?LdZLT>MvAiJd*4GUT zcIw=^!4E?H{$Q}j6^Ruw(WYM_q>#L+%%B2HG=){cc@rwMIzZQwp5BxywqrdWbQZmz z-DMi7WRi9@zWDT{ES;0v(@D=QrwOFa#!NpQq*Ui{4{nks3EcO;LQlL4qeIkX8PO>b zsK3yaas3wlJufhueWFdXs4~`wdn~eO)m)?;$ocK%;zTKd*W){%Pho!Jiq1gTnN8Lf z&SHD^yD|LW>3kN-h!_F5q(jk*j@|LBZ_QuS&A`7UvIIvUs??lb zII95}3_&%@zQ!Xidfuv+i1ru}Mscn+yadM0tQSDXoP;+y3!-eB_imCh=do{R7bb{R zPTMx4H9ZUZ>)wfJ=A)6`6-6~>J}t!y;pjnRTLRV=e12V1PuDyZKknXWW#;xM?46p~ zH}BxT30VWvmc9)7h4z)*5*7(M;e6)<=0Wyqgnah1bm6~n>$j3$$HyYI1VYQj(gDn# zi&bg{l#p49VaA~KT#E5t*sR(!Lmym<)W8n;&%fIUAyud!*Gi?tF4DD ze(sE0gQKHo7pSnS3^%Dh9-^NC8dzHc;5KM}aBC9YJ$q-iMsGgrv>ks)WqH`y!b4l$ z_y$|H=nW1^ru<4md=}@xTXprA!&)96*3xtzY)$5%O6)KXpo2YfMUjFbY8G)+V-nMa z^e+y=9O<#q^;QRd?*Ax+eelSTR$qc^^rsiQ9VIhO(Kh7pyS{xT>LLvr^Z5O`Cj^nU zYVPs()}0kg-o(T^Frs4Hb8qJzCXKmnmV?ktG;%OwJlCnXt1*pEgVd_m*FA}N71a06 z7G0K65iXmj{6shHXzz;y0##e;eDnLvjWA!W^VGWysY>-b^Cj2TSH&h6a3`mD?T1T4 zGK`(`K#La$&E|t3t4xb1gIspgD%Nkn-AjnLE9Sx$m0R2IsRdvB^Y>%1K1lSRC_9Jz z0WCykbPIeV${b*9G6}NL7BdD-PkE7Qm(=&$=iMq#Ef3xzKhNe6P^4K@2VVRP7OIn3`FChmX9?KO&s z$OTo(poxyDBPeI$>|r2z_s$AN$$JLU*nHlPR@!!aA7e zj->jC*~qr2y$0V{4!(U>4eDW-h2?|;TG?DG%S)`@Yj=(+mZ5w{A8L$jel47G8$yeO z&dObgP+Q%R|9)ELa!~`3W<~wn9j2(ziL~U8nby;m(Qb8G$HM0azlGLjTnH(rM`Ga6 zXhjMsTi{B^^_@T%1>>EP==S4$%xpBTYZ2!b?>=r7+ZXBK(p2T`_hdmb6Uq_IOL7$WNDZrx*=T%?=kRX3$47aXtIpW7_R_0 z6fU;tPZ=DhcRQT5GvtPajiNrJc+k?88>pnmeSaDIiaxg{^xujIbNxpVVE~wu2k_4Y znMKml+S$|*z#?gF=xi!(YHVj>iYzFI?BwidYG{M(wrry&Z@0yY(Xp@LhMu(ic0gfR zBRDQI=p(VKPSTeKjgNA8NFT6;mtP*QGt%nY)X@E*gIF#y@P<0?Z|bub8`oQPu+?9w4!5ES-M)!MdM1M>JWhGu-O$pyj}k5y2q=kc+%(6 zi(u{J?q^nSVr}l@HVv0Y>5Lp5eZ`W%K(wO30nrBAeOV85AL|$e`7Yb5tUs5oj`W4l z*^TAbTdyo?%kp{ML)}qL%AN=wC zgWkrY2f^dReiyj$AKz}?6W)7^gb!4kN5bd9caQOt?xCxZ8Q|nc+rz=fDN!|GEJKp_ 
zr^e@un9o!$u!v4<-GNa)M24-}1TB+2j++~=L(U)oTz3b$%J-W_@KJ*&9WXy=+G2Osz98t(xghA&x=1^Wocw7AH8b{`SL%dKZs2XP-v!^H3IyND zwLq`O(r>;MIoUsXX98d}$MgX9`~HDe>|G2EzEkA9*QrrW=t>AU413*BZC6JnqmLQF zBqf$-lS;2;lIX26w$2@nWQB|^Hf!BGd7@TTmFKk(>-u(nt#Ucx zfQ~+h-TuknR@chRJy(1-!JXayJL9>?1?$LDd2_AxB_Y6N_B!%WzzJgQiywVsXYl?~ z_UoGGbko%+!rVe`exbfCzl)e}$@usAWrTTe9W;b`2jo2p2?(u2Ta}&ERDoU|Gi%S-X0+wo1eQ=-2C> zTtBe7ExheU_w%DhX@M6Gi5G~6y#TP3MWej1 zln9iQn5;m2YW1?_kfEO1Hw9aX$Wfe}dB7j=1(H+gDqpa_$1A503(I^a@8;x}d{oJ~ zH#%&)uOk_tU_lFRAlGH|n`%ZIdk3b_?RnP$h#6qx-E)J>WNqvQ?iY6x429=7*}k{t zK`U|TIr>nt;|8l+vruYDF(I+?BP76HhFG2~%sYA`eDeX51HUSV%8iv-Xy{K-#UyA> z>$NWje7%70M&6DBost7?-mqJq!}w3asI8wAfxAt611ZtqIWG!giN>t`~vF z?Y!UVSa-aC3eBlIpIro7UA>&ET(5V}&H>qvRm-hPD`pv9GhEwQw06sf_EoFF*S_Ce z&g4Cda$}plUAoM-bIn4Hs|(32DMWm21_MjH`!ECc4Tqg&=Z ziqRa$7^f96@`-P@TW-x!4d=E%O~iFkiJplXJUsX;a(QEnzv?h^+Gf!3;R^573I7p& z+8u8^=VAF;{*l?XE80jmqzb zU$`?}la9Wgw4tagR}GXiF8Es=vAh~hxcyR`#h7Ki2uvhJV5mD^WPHvok_60=mHIesFyux^Gb<3Au{>ptL3b1RYoY84pg$O-c26={ms`NLinJs4bcwKCAUk zuERMSmPIoRR?m?iAqKys8?-;z9kSiO?@~f-KYH)YB)zRd*{+T_HiD(>%M^P zpN&C2BzWgv(%2=RO?y1B?MJ|Wzz;)LVPK^oUeJA|(Vg+m9CtyL525$<$~nPDheSGB zquAWO{-FCP0zti>Hj9)p{0`qB^2J!0EJ|asM-uQIog@H7(?YGoxO>fs;MRuh?#?xnT->O%p&6AY+>g}2V_yTbhb8y z?p0ijzL*+2LtA1{sp$;;O_!aU3%YUZva+(u{RP9np!yd~|AOXU&}HR-Zmg`Xf5GxE z{KM;CX8ZqVvHjaJ`3rV`!5GLQYwG^h&e6n)4thKP=+n~9RvfB&0d(TLY^-dYtRN6O z)Dy(P&BO|(Wo4y>`pMgw{7){bj)wO3rY2AcZD{Reip=s!RYH?l(#6`^$k5jIZ!A3bdROc$;-*HH zhPFU}mA#>}g_Efv!1e#0&&ABn3=J(*t()4K*cp96X8X5Cq3UjL3Sdz*`l8}&gUq4; zVCVj;^&Op@0c>2*rkts*xw8cT#0hU{aKn$hc{k64ug*@c)=ck%Eh_ibEek{m0f-XunvI?a6A5;yfj z%&uHoru2wv)sK<-MmfS_0pVljNG&tDu4H~G{5Or; zzHwV0%V`PRcFD8S)F#4QA&^EM!nn!%R9RYaZ~bX@oT^tE#Q69)N79B30BdeI%^mr9 z!wa(I7<}JJ>SWP>4XB{MLYw-)B}PMYb!H!l#JQamxzi8(6VR{0U!J+i`-uo2IL0k3ESi_RwSzq9wlYH^LN1Db z*!6}{Z)AsflojM|-VjJU3^eL_oYnn2(j_D=91ub?-oq`xKrGFTJJdCE*6#5uAmM)d z#qcdDhtl-NN~UG2D26ZT5_2fxh+1ngL5qu#2OK3GDK8bA1TQ^^QtS=6nikmp*2?jR z9LM$|-IV4y&v7z*r7E7|OhJ3Nr@* 
zv_wVuba%UqI7Z_2*xPhj7&4uM`qD2}fegV~Quz5a&ax0oxkTMZ+?w8EZ)54maALFI z2v73g6DiXkR*EQ}HA5u?Qj=zi(~D4!y z+rdaJWcnFZ7cnks!k?Hr=9IW+Pe*GERHLznN z#FKNrb5~jM5%su4d!1&>D+X*A3l&s2hIVc4w@;$@^bI{FYj_P>t(`xvn#?F6@o0bV zauIU6o`in8+0LiiO5bWeU?F%qnSG9{=%uj>=9K)_^aY|*b6TP4PM>W*^WC#-w>8Dn zD~Ie{%+0fZ{nFT{SLj^D2iLo6OT*12A_V;|mrw&Ms;S9aQ2bNXy~{}T6#hBp8`aA$ z<2V{GWd-E^@?P++$|2(5fJ=fI`}AbVKxJjfQ;z!_c3*|ViXP(!imK^EBfXTbVV+Lx z49IIwrFWUW`$BiC>NMkha_woT<%T>2E)1ksZ|A=C z`s>wPho|xLDoD~e0foUaJ8u-Mkrkq-s+O*Ywvp+li!GF0JIh9z zRM(o99+~8}VuyDy&iCZRV{QB8FrCubkgj1Y-d@Q&GclKZ==5UdCNpa?vxQ0Jq_J#T zz_k3XrR?REQ_XHsrea>*E8TkOkZh5VbXg|n4W~ECcv->OxzMQLYGa8`@TK8el+wJ5 z6`FUJ_`M0lu_mr=gOP|>D`|S<_P~mM0#_e{Y2kO}U5sM!d#JqEa~zf8Sy%C`IHt@r z>H+q_mrN`ewSG#^wAigzaGn#T_sI$_9pkj?$9RX1j2y`7uP%(qUF7o1Elkwuo$WLj5bPqOrj%eq4-xiLvMFTeHLJNm0(ieNyKN%fU= zK+2X1Q|g~N!=1W$0eYAs)I#^26aItk>vG8~Ro_U(5z}afS$*#Z8cC;lFY19bQ$*+p2{xIyfaJmPF1Nlq}E z1iOTU7?_)tM^ZwZ6D%SK_@7;%z5FZLv4Q^A5*-J!gbAyJ9khAMcVjcyNdymD`XP#u z3h)j%;!lrW?uLc<>V5a%EJPyRl?(8-HDM)GC-f8!_%NE98$e%NnioT`XF(x_VpI83 zB!ZePYH9#c;7?ZOcD29af?H^((9XVu&&jr)LG*iM>zb=afN$)>+cA;%|K(l7$=T4+ W+079ePEIh0ivyXOT0&70`Tqa~eF6#q literal 0 HcmV?d00001 diff --git a/examples/notebooks/intro/input/solar-system/earth.md b/examples/notebooks/pdf-processing-1/input/earth.md similarity index 100% rename from examples/notebooks/intro/input/solar-system/earth.md rename to examples/notebooks/pdf-processing-1/input/earth.md diff --git a/examples/notebooks/intro/input/solar-system/earth.pdf b/examples/notebooks/pdf-processing-1/input/earth.pdf similarity index 99% rename from examples/notebooks/intro/input/solar-system/earth.pdf rename to examples/notebooks/pdf-processing-1/input/earth.pdf index b6bc7edc817a7042e31512a046392de68c53ce69..9a775a9984c7570f96ba5326cf63b6eb5039e76a 100644 GIT binary patch delta 136 zcmZ2}l6m<_<_*7Z8=4vz8X6lI7#NzE>l#?78yKi-a_Rf#r?@1Rq$+5*SQ!}@p(@#| zb0>-2+0EF|z{uRt*v-<#$k5Hy!pPOw)z#U;+`!V!)y34p(N4jJkdnz4A729i480`s delta 136 
zcmZ2}l6m<_<_*7Z8=4qc8kiXxnOd3{=o*--8yKi-a_Rf#r?@1Rq$+5*SQ!}@p(@#| zb0>-2+1Sm{$jr^j!o}Ft(Z$Ws!qv&qz`)SR&Be*cz{JhP)lR{Nkdnz4A729iC!8f% diff --git a/examples/notebooks/pdf-processing-1/input/earth2.md b/examples/notebooks/pdf-processing-1/input/earth2.md new file mode 100644 index 000000000..04f4eb6c3 --- /dev/null +++ b/examples/notebooks/pdf-processing-1/input/earth2.md @@ -0,0 +1,18 @@ +# Earth + + +## Solar System + +Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun. + +For more details about the Solar system see Chapter 1. + +## Earth + +Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life. + +Basic facts about Earth: + +- Distance from the Sun: Average of 149.6 million kilometers (93 million miles) +- Rotation Period: 24 hours (one day) +- Moons: One moon, called Luna or simply "the Moon". \ No newline at end of file diff --git a/examples/notebooks/pdf-processing-1/input/earth2.pdf b/examples/notebooks/pdf-processing-1/input/earth2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5c024886afa100e99e356591d7f60a04858f34dd GIT binary patch literal 58532 zcmagELy#~G%%$75ZQHhO+qP}nwr$(C?e4d2oAcc}^RH$xXOTMDWcO4hsY9wDB1X$d z#|lNdurj&<#Y(_HU~gmv#lr(dFJo$F?qWf}&cRIZzXe4vW@+PM>O?>~wu%ipH83!+utNc!ND*7smfDCb z^~XzHn?q)3(BTlNUTv?w&o>;^9fWF6zp?pd<@s=^RT)H0rAiugQg)!U)MXVVwXPze zOS3+z{x}A=-1gn(6K^3Gy9PNM6`IiU56O|euvNG-p_Pn1{^*#iB~jL}qEaMProwU{ z4)yzjwvICSpcoiG|8)`A++=31C}J+@T3SvFh1Sn6rHjZdr>O~i0P7?cSG})2`R-2P zpWm$wf7jWAN}}qyTQEN0AIQD?wh{{o_jYTW`*OxZyG#{+DZ|KU2b5eUM|_Cn)Fl|P zCVxs1ncGG{E6aFa3W;yew!0m~7vf#F0-XT*CngZ#V5|PTdBK;7KZG(PQdupwf==h! 
zD2VJ{Tg6A5-tyg6cOuan@!=B6mcvx2z#jb>h=k~O%EsVPo&4o9LXNgWC=yTHhGWym zBcYHkTZ?V<8z#l%Y@aw|3@qf=Mat+>P=GYksxDQw}Ra0aW^l5Hx78O)&m5Xco1Vz`%&+$z40gsGPP7?`q1Md#nEFr7&Z#q(3mmc{G)ey5*^Rs#cb zqdar4U7zP~OR!E}2fX+B`rfH$MtkU;8v`aM+8`Y8lcU~L!3k+NtE~L{pniFg)G$&q z)-MEg+=?8qe{;_}sLR$4AC#$`$^RkaKllGp#mw}7;l)V6$j-t3Kd>Kg6-w!Ea$Xb?SqDsBc}Q zfSy*&g+b7Fc1fgX`tqOf6nsg1xAi~5SHh61{VcaXDDVz|z`yd?)W~W6&c~^#M*weR zYjE{~;K<-j@q+5e>TKwNcl_;fY>VFyeULx^Mt1YFeqNN{?UQeQ+u!Uiiu0qn=Nq4I z8J=I~Q5l>aot{70>_6{qGXijQYjpK=e^n45nwpyc|IO}iU1sZWe;X*GD5fMNt4o+i z{NS^J0)S72@xaB^<(K^P{uhyv6HmaMnH&H#JU4)+@dPQ3jOf7Y8-9&$g6~xMoljwM z^@L{hWq+C0THuM{!886^v^BU>G=EigcW09&hXyw%;Hii|_3LoRfBv><0_p(F0Oue8 zURj!qKejLPCEu*g->m);2WLkRP9Pc?8=U|@HM9Zv=pgv$$nFFKtDCUrXHR!=f6)jU z9RR0mW_9?&zm19rzm;@H27o~K-_zfN$Njp0t=}tuHcDCXxK?mZ4NpKBz_bV%D1vqP z!va5l^LStJdP#O}Zc${f0+)Z(=l)`u8`|5RKl>ej5Q{*+rBXkC5^ZjYEw3P1)Y{!y znm^L3{+p&cvV38BBZEtD?~O!%87#lI#NF|MKO8PKe|ee!U}$u4fA_vKMVYn03!|U| zlOOTW?}dZ@_df`~^x6Q)H9e1ndEmxBCa;x?g{xH29qb{=xX2bAMo*05Wv{RX~}#KWI&U zr{3SdKXts5{<9%|)|y#ldrH=2_EV}MZ?a#gUyunTB4p337%V|esA<4dUK5t_ zCeKhV(`#o<+(_IJfY?Z6#=N?{agDV~_m(Rbrk|;}7a)uHB+xWA8FOxzc~g~dyP%Uy zJBRf*@tJpwJ3N8`<3FJvhH7mbz_|G4^x0_5Nh!Ui?`cbC4 zJ9_gj8&Z~`Dod5Jg&@_8EjiVBZOz*yx38hMmNlfZY;ybR3DxL zlI2Zs;v;yyjkux!NTsL)Ra`ySew{@vom&1?djVnyI^K1iy|Oe15)VsmHUIVeaQxRG zhR+`{N72a6P$x_`unOt2#~)my!!cU7X7yQmTYF~=_{;?4FENHn%0gpe+i=sj4vOSL z8%4D%VYU$WHuBwRL1wcF0Uksd`3EOz{6iFHJ0m^h0Y=FjoEt&bLC(eLKF-rdfnq-t zNzb$ax4=!t?~ausj4k&e=Sge9p8Yw1DAid+{8bCo-JSZzkYWn(L4~iIP;y_$uiR(W z*h5`wdI=VI=D_%E{-^BnJF5pu{26>oy=F2Y9PWv5#G1E4kr8$EO%P^B#D=vlEgkte;0~Qe(G|faX$tq;y__=enGKI-d5OS zVIU$jpL_~sjuekm^k_GazL^_h%&dxw1+?9qbaR)~U<;1CFFE628pd^!sH6+zPUJ(8 z{vnjbF1Re?az#Bs$Nk|Co$wdYynAiUm7NL0&yIQh^3hV^*tHYT#Cqefr#VHLM2+JO zdsSN0t-4z_Lt8ZjX@;7);Q%0clGY2&lRym1CWj@2(6EwZESPZQ^p?#{>OlOEsbA?#ki`p-LqEO)AV|X~XA>mXqW1!}ZZL zgUVqJw8)GQ6t?$7f2X{537mzP;uA7&YAVl`0q1el-c!_pvUU_Zo}&rGWE+ml-?GCx z^GH6!TKxgLXUR~k*lT}7x~5LMXM%<$<$M2G2K3ytrHC(IgT^SAz}xIE*B_4*`~6$H 
zLc`XE*MJ8V$X7rno0U${=1G}RT)AiFMT>O{&6^qTMNAxdV3lPac{u*cr|-Ayp(R)e zKx`6ixv}hHBtJhOx>uf5xnVc8y2L{WJD;51tX=-x)@udVA=27^ut{+q6uFgA70#JL z5NbdBrIv`~`|2_tUtoej3;J4i-^qS2M6LS-uR{!ynk0GZNP_Se2>EaAl~vDt7J(4W;h%64o)(!2 zF+-0+3dJpEx;S*b5=KHO8htF4nTzzb1sQk56k0oTO=n5^tiIUzvJcbmS=U zJf0jCv1ckaKAs1v8nupze~7WciFG^J?+Hnv=b_$**X{ZE-8V-gmFV+9Q$^0^13GFU z+wivD`6_xzK9xK|2Z|v6o8~hd@#P$cx+lHndd7FlKG>IcsM##c~HXS7LwC z@Qu;;DC_HQ_3JI4VB8qZrc17)kG@}5tx zj3|R-!++g?8_#lKZg}zXCfnlVxnp3vBYke&LUDoRu|9FD*OA?ALpSIrl@>0a2{npz zNvu_@y%Uw2C%Y9rls9ot>C2UZsfiOgh_*qG=XozisDXqVK821Y_P%4=W3V*Rsd{?Uu@VlNH$*DMDrxZLA_7?}&86QaI|_<@w+y z8+wdbhKm`6RDGN-%Lg~P>f+VhHx(#}sx6H#;p8p5M#<;g>7#pQkro&;(t~Dc7ZOrK zyrmKAzWrf@@m?+KZzIqEYs~i_ay@zxF$;iXjDrHtC>_Bm%q3kyLS#@rZHkz zHn(=1s|Vtad$xgpkAqHN+#N~ zCz;I2UUWs%vkEB|7ZNZDB1_&|rmQ+0p9QYgg}St|rj!K(-IX;5L93Dr>}`{bd;RQ= zMHng1uHLL1SD{P0V2r?K?ND-QVZ?g0wmRA6`*7b+phfwdOnT>I4)5a)VXa0z|4nWvRd%j`|DuR4*?(Q8Nl@|Q z57`_O8Cy~8X{YTB+%j5yyPOU`d7d~ftU@(Hw6Q0hLMyGtSD)E;t6hC^bvY5*k##3i+ z_N`GYZ4ht_T!)H!@57}kPVC%o9-hP9Jzz-&C6xGXpkmGKTgZebJ{v;5b>HFfS1aXH z6zY7FMZ}{bOKza;0cC5me7f@b+tq@t{&7T^>xlFcZoy~yK}oNbB#({aOu8CeABmNN zq=FD@KzoolmZ3sX<#j9hRZMc$eFQ4Bi}u5m2y?$U(!eqe*@{?b!KB(!lZrxFbH)G0 z?_(L)%p&UT39Nu&qE&sNS^z4_^6G3bF2aKM_7UgarKV+p#5wsD858*o=N-+dcXiY@ zb{QRSvPt7$crISGsH%ciCDMF1K}WM$#U~AL_YYbk4-w z=mq)<<(pE$vnFSIO9x!Qx0f&0R$V``zD2(yGYM$%GBvqPDfwcOQYpoHuDOUo$h?To zX|eo%nXwohi9>6#WAZ0u`zM&7<+mLy&|YK%)Sg*+DmgAZjyz2@^%G2X?8?)VEQCbT z_*{YSOj1P-4-Zq>Ubmfl`o6kOt-+Vio`LL=UljI)Ex%b=cU3+)-46EPoBA&tHY6_W zdd71EdPCZcGn`;AEc9D!4+FASKV1prraNBzW18jN#f7#mj860Mde^gM&<|tM{5=27 z)@hc3OV0DbWMc=<)4>5@f8iT1j`8!{**yD%GyX0gzLl$7m zj0SgKoP^3`ZA0lV1o>qT{|oT9uE21yd+a&Rw%E&Y=~t6E+5`gY=Y-LNrP6sbXT9c? 
z|BMvx)$E`aUO$%8MVDRy)dju;(fT zG|AS!$674=+is5{4!eP(6{Y3Kb@A%u48V4{rO)8GBU6qNkD+{2X^A_7#?1S$g-(Nn z1$X+QlY{j=TrvLbdRGkDCi1ZPzZ1cmh|uqg3jK=kgI|kq#`Z5n z>8er1NXP#D%S6G?p*zXaG=;R0tZTzJ;Rkc6%Ql*<*FGVmXu(dj1l0u6aVwmz$Q9b- zZtXAE_pBsxM{~?>2>lR#P#`HeI-Rt*b9nn}0(P&tewzITqc4?*4vKO##OFt3LO}$R zzWPQvmUZuQD!x!dd>5oy_ZFktyK@>qiU&UreZU`p#7^;ja#q^I-Rtl^S>!NxGeiEf zpL4sMbxcH?%lv%~Yz86CnGcuBAw;QPC~i6+JU8Y=TgHSij=<}T6KNAs{7!7bR5LlV zeMli^ub%TqaraEj+}k52lfx!6tXF^tm#&4^O~>T-kcDbVadV#IvZp6Nr-UyEJ$-Js zx7YC>g9W&R#5+252fB)Q;78e3#3!hT@?nr?u3M?$`BDg<^1DJi#kXectm(LkNW{+B zr~?!yeot;jI=*?SRdsMn2{Gv$ID5`^daSn{>2{@_f8S$!Foy@b+JigETE9s`v7)#m zeL7al>IXk=M@0>{#Dqdm*1&K~K$-79PAn^{F^3J6^I1A@)5u7VPH3dlR6h8F15fJ1 zi182YL}e~XFA+}!jv8PrNnCtc;9TKcOvNZ3>X>hrR+3L?_M>e=eaKSXfKjPp^{;0{ z+pI)u6eflQ$H5;U>FJtt_#x<-nA2$I!TUm+(K2QN157}(!fA&ZRAtuTf@o)*t+OWw zY|GMF&2kt#hOg_4>C6h7@;z)Q( zWdl8$UXkKD?*2Z;n^Oxj`#Yl9XFhK;4$1dil`}w6Kzd$B2CDydWu!5voz7cQcpurH zeAyKpcVb|Ivpl)uzU(?i8mis56;~&OY$$=-quuOuxaTDlE!rc>S}CU};^)mtlpmED zG2#wf1biI=RjY~}$9*@wAF=hduQS-Ht`#G|AR`qYEPR>fc4P;n}yK|Rq`^+p9pmH=g@$cKS2IesQk_A^c<&j~)1 z`P4Cg5>bSi^!y}T&5m~p%q?KnD3#>ZmG>@_7t~y110NlVW@VP_yf=upQ7A&$VOm#U zXZWp{2~2h$iheC&WZNAnnEH+fFH(J0ea_AHQ`>axwD{61Ej%@fjCNV}>Zd7|ah|jv z6*hG8VU>Nm9&_?Nt)wo_TrT|BA7hZ|gpcsUjq9!Y{9;RiSCfo^$D6SAL{_gR`umWofg*z5le43penkdVVpcvw9JW(9uH0LR!n8;s?w749=9 zAfd3^P3e%22ME2PAkZV=7iTw74HW$duGUP+1l77e_)(kWB5{^2LWp_1j5!=Kbjpg`I}Iu zRO1~#D4mGpX-IK_TZXGE?&iOMOT6c?-fym|i6s_!LXiL?H1!ccO6lrp9DS6~O(B^~ zUFnj#7vV3zpjt9eKBY_YAW3!Jr*rnGBJAGKLS&IV-A(P!^|XcFn=0 z$m`?GHsbQrRYGmq7-0LQCGfWE!t8C0@pQ&Dn80ea=le*9HYYjfl$kA$X!skY@8fCU zKLU|YLpY8##!e*CiVc)<2f*Yt$i!n&ttu%*TpQRN>p%kl*hDn^g`XmWpq{R zj(^Ps8?dqUs97qB!Vx9DWrF5R5T}^KG8)L?F_CD&0gp=3L$1D3|5#41qkN-gVG`VL|MgvWb z#T3ZYab*t2OX=Q>b|G7itWY9swlB$DM_Gh63LU)D1Cuu=&_isoc3veH##A6~3Y5*Q zOj;c!``o2eM|f>;vff&e&zKhgx|6z8>;}+#RsY|>*Xrh!A>v;P(M5#vBsGpMk5i!5 zGeyG=&6q6-@Ykg5H}MVCh-muVS1sU_X0|AWcunv;o=}zdp5{)N;z~S&)DnfILAOD# zGAlQ*y2?<{7tUUhh8@b&^x#s-fiLBPf^$i{XfF->R@8Xxg;sIN`%){WBJw(#y=`@V 
z`Bxi5b%|cp^~t{9)~r-mo1Cz! zJ&6&KjfvnE2}Nd6uSw^`>sG(gUK1{x=8y7agdFnU<-a(EMROsiJX6RxI58P@8k+rw z=L`ZKSk}@7CB7ixJrAs!&QD&qf4wMo2)k@DV%JMI*Dgp(H+CHlR!znhMV)er?T`v| z9$yYccqqf@FkcX4O!Z52d56?HO*}>2R3Y`TZQ-K=*-UM3;YHyB->$;s6WK|9xaig6 zltfgQD7&6)h#f!5kN=&rjr(=q9<#MlZDoe+L1@>QUe4=`Br0hgsj8Bij)!#%zqX~( zpjEbb%N}==#>*DwLiyF^`S*~uq?l~Q-AVXLQM6}~ zRrlk!S}3KS0ai8y9x}k0?+)W<$I~m@mF_r%Q4`tWI`|o+q;+VamxHW=C0P6(kl(5J zXA_Av6qs3?j$H?XU{a_`_jaJ6|W-5+k;@e_8G}HKb zb=7JPD4UoHeSuZm{dAK0V@webI;U-N(mt8TU`KFNt%?5cn+@0l{JZ?GXtOsg;)1TQ zuQ-B@SL5P`Ms|*vc)Xl{yc9yDx#i)RWucu^#E~0?57icu(!-^VF%|~;vqGC<4NJio z@!he?0SJQ8E84$pyYWfJ8g^5iW0}44%I@D_WBikck-0gLyc+e+7aZ|r;~G~OYL)34VJ=y z>T3TuMY=_8IVm)l32UvGb)mWA%%HP9o6D zRB)cwZN$wi-JJ@wnScw#ms*}(5Ha@b4yt?mboKhDF8woidRE?s%xlLSq*^)k;JGS6 zBl3_u2RXbqSk>q>UMK~&EE*9#% zC2_bNNXGNM`3&ZMJ_!a}#J^7qBc{|ip2|itKIZ`@=Qfn|7jz3x;!FqGx;Ae4w4~(5 ztCP|@-KnI&-KDPO3MDv+vcASA5cKjr`6dL21I<)Q4$@t`NO6&U+UdbFt0b*xE|n09 zOQm%?U&OQDUaBN^a+BlU<95gRqKStT}P z!rCd99B$`V@lLZfZ`baQ(p6V;#>3llX2+vyL%RhX(7F-Z&RLe9Z-}fQ)5AFLG;|%? z<4%nch8i}Oa_7xxUPD*SFv+**=mSqu_53DxJ@2%7(j$B(!IV)W=pycq#f7i-|M0&$2 zQl!Qy;(F1q9Csu~X6O2Q5KDmpThNw6#yuZe!3w_7khwDQf=L zv*TUogile*ym8=T+{=@_gR)_(^abboVELjfnikcO20ZtsrKMV5^iZU48k=gtjjf0$ z@%xR8h2owMi`l7yVK*A)r=?L(%WvxEWk^80cX!o+HB+pNFjmPpIZcP2Q;kCRT}dM7 z#5AdM&BNqLg;`~ppDV$=dF|Wwc78!Hik$fVV`9xCc?F8HI&+!(-Pb}mL>pgvmAMY+ zex=2ipE_4e&Y7_kS|lGNMzD)4wA(*c-C0e0DFlb=g&NUFaBjmi3yCUUE=R!!Fu!8T z8gwNwzc@v92U{p*nc>d?6@rDHxo#4L@I2)AmmGv zjR)E&J6!Q?5&X*?P8TIVN@t%}YN4&_gM4R&`f-IJJwMaphD=WXY3$S2D*@Ue>~i0) zK3Or4Ro$)t;}Q2?pV`YZ!&KajU-oDiiX;t1IF}?_kL&no>ezvKMy?8L5+e8O5T?W6 zC1@#lnP*!nsEDOZ!=q3>E2=$&{TX@?5=y43?z@)l)|`sc`Un>MF7Y^y3Q^rfc$Z$~ z6a_<1UpbZuDoLRx`-aQcw9{@)b<|Re_9XbQ|DH=n%qhH%Gu z2eE()l#5^q2M7Orm^}+xaKNZSD!K$98Oh=O7=k0Yel4326fm@7AfHWKa$=!F7XrT! 
zr&o7&D@bDq;~Un41Xw(A4s2z2fKek$Y`^nI0FvuVT6Ucy+(RzQRd~U0 z(b?v59vb|<6{Ec6g8};0OAx+(BsQpFM2c$W^fa=gNs`ETz_|ABJ(Y-`y+YONw@tC)Zst6 zi~TF)z=iyZ&LI+ejc09yTSD7qSLxzVM`+hb8Z&N~{h7u7J(=FFP~R(Yd4!^ai~YFR zlJgE5(s{tnSd$$GTq?q&OS;3h8)-NJ!&u`_D~&nV{LY*Q&|zQY3Q~}q&OitFG&Nmq`W{;i6l@(UC-X`ML^q+%6H4HF&&HS{N@e6UqNV?a4G zO=)M&1YwE@JA_^d%Njny{iCR&Sg%<*^tF2tA0C+T>G+H$sgK0_6vz*`Ea+)JbvRGbIYCdPt;Utri)~s!X)33tqI3gb1b11esm`k$T+`3 ztQsb%OcB46d8Y}O8>YNFfG?QMDX)V8zv$LP4HNWTw&Bw3fXK^DFt}>#lrUc?I9@!B zsS#wtLF>tg zbk~KysLwF8lFJUV*)s81-I?H&zfxr3=4s1$BY6xv=?z>idEezl(BQhL1L6JQ=9gq zz$C2ZiO+457M}IVd~QJ#uo(}poZ7lcJwP^L71*x(LP+9z9KAo)W_i9)_Yd2WS5l6Q znieye*wjoAzTZJHqpKN;CL*kj3DY*#>Lkz_uCyOvnS!JP7t6bcIVUaCaEKh$yA=RWmcl5)9l%d*uTZa1r8IiHCLk?MZ2rtB zif?4u7hsvq>EIwLR+A3UL)>R6x61==81lUpBECc#})YGbRPiWct zFNE0L*W{B6U+}7J-s3*E^X|g7S*snLaf60TtW zSr7l}V{oK*y}x|bX8q-39|tvP?ic*}%IwZm3cA3Qt#@R#e8o}@O?Apz zqwZ{6e6861s-8$zxubeZKMT4>@x+_g;Y$nqeuSy2+kyAygIY4VQ}C}#&gp>{!1orz zc%ZqcuOiS5c#}?dNdB&M;diJHo8lb27Cn(<(bCW;^XxCeTr=E0ibMi~!f3xl&4&yF zZz#E!*UFm^Z&sT|D4P#Sl=JxbDf}C68~!JrnWuxlTxm(@y*&iSm`v_l*I1sTB%B-n znJ&HKn(-np+zLJe=V>Sfy_3ZMntz|NPv^!>8{QQx#3Zy21~h*=E+STiaJ+fncUecT zZ!M`MXlDrmS+7jor&HjJed84OW$Pbf_!OuM=JuW71eVKFfJIU)cJ~qlP|WNW2a-W8 z9P9|;pr88`5kuOVVMpIT!d55+U;o~hi@|yto#`~SV3bj`S3>HbPu;d|+(f==jfDIE z64m)zb_*jO$sjj7rk2(rvw~#>VwZ$o+-h=8(DQ)|)@z2`AoF&$YB4fpM*dt%~$Me$aVE;Q>yzMLALk>T`TI}{QBA`%TuXUw0`Zx$*UrK zYi?@r-jOrN5ORpvZ$``+IaDyP3MnlF7ncDAD^v7QA9mbbRPv&RnwE@v?BLJ<=nVp8 zG?RwLwg?6j1Bkv>CJh~;EO5p&|U6{;5z7ar~pf^3{PWH^8(ZGI=${i+-uA;rE2t=sy$Gy(efv(>Z zTDenPzIP}#21axtxy#Hg_ev@*Es}aWlKk@zllw%$Q&27|t+`S}CU zHA3)Wj5jJU_Q|a-e~Gt&-m$FFCbmMnql7__E}B?bf%TCLTEJ6E{`YMwH^awK%t=%( zL-e71oVdihET8&eD4Z@`a+q@%yY7Ls*qtv}jck;6a?)-D<3Ye%rO|d%)tW&uemN?+ zMoJL8Q=KaMyMHAxI*V|MO6qKQZ4d+_rM@$Xu}B=8G;sPbi$TPKeSKDx@L9+A`@3SJ zE!BUbh&vB#YI74hnqZ5;OQQw7<*quB`D}Bey*Y=03~!-zNc>g|;>LJ4=Ed_sZfTv{ zC#eKpAD)YDFYb+C8fyVlhH|z(H6j|;)9;0-S+j)VQ{bP`U2oyB`1|qH*@H4tyQr)_ z^F!+`7NXhqOvk!6(P@#&TTGS#(mGb;UKLF50KfGbGzyAmyA1Vuxa2RN%U*%hn@>;~ 
zEYb5i$TkV2a+4o*rk~TfbWkw&3R=@ToG^V=+c12n%Tt$}dGK&^U7jx`hhl8{uBl6& z%%+N99fEr=;&@)S{O>h4TmEbVfug5pd(vN*iQNbgkKpk~izx*lW5pfkC(>e7)k;30 z&Dlh7^$5!GnrRqVMQ2S$Ux~2X*{b-Znd^M&k`eAZXPACiMSw`vyELbseTS3{iw5P( zC`|Rsv`qUPD^shLQS8=xdAK?uk1Afk>j#g)=fYxvxX!N@4waMA1VvVz5+fYXy4Iy4 zHI%GSbDzJX@0+b8j`;Tm#mJ>S-7^2gb2M|wF#(`qfc)f=AK`JdocMK*CfD9%V!U^b zVMk&Wnpiku=n~{uP0dF~@%GuHk)aB3E|Dp375jxgcty1!9VemAHcJ4e{%}FWs82MU zO&)Kg*x`cwhp9Wst}cpbGZC$`|UL+t9UnjG9Tl?a6dvxrRE54`PGHo5P^WMG{7|p{$F6OKSNo6 z;mMryYq?{+LR{8HUMl*unwVC;q!DA5^JAf&e8`{Y&G}RE<-lMwzR9q^BuqX7)pR-^BT44w&+|EjlrQmJKmx- z9a$t{tNWvTs#-)yg)j1{TyyGnTALIC#jCVsz&jfZzWmCbrE28sYIg=01J}#u?nTikF1k zQt;_gilAuQ=<0w)fJA+7F33Af24hU-)pkJ36qu%?38Fr-dh(3lzc1(3Gh(tap91u9 z=<&izsormmWS`UfCVN(b!X`3}q_*8)cJ`*Sn-`(Pd;P;Hg8I3*|QV1x{lCUVf+xp($%IuV%eZ|f*^7FMj}Mc zLG-GNK5FkRy(gDh#eC>=(6+2Soj8!fI{a!IC*Fp|{C(BlemB13}x6OLB6WK~L5kr~Y34*{$A z1`QACNs&ABxG_dVa^t7q%lmV&%88N_AR*myWP{vOuLOSA2V$|{d@oONe?=2ogy5EK z0_M2$DuRn#&og_WgmWghOO`8#SHF>;EFRJaf0+I0iGsU6cF`s}S_%aZGPru|yO%O0 zeAu$^uDtD=lBDBGC$9#?%O$;#mq)iVAyeSQLDn+u4HDlsf5X&U&zC9c#(`ZffxY3} zf|t!))S|UxAVvmX5xO`&3mQDqV(6lb$u6F+PeR|FQ|2~hK$qHkT5F(#vaT)GfPzaS z2NYr=2XEPVYQct9-@#d|XDw$#*&dfsnej%zf^nH!wt*C3l`g^h>4UU0V%I@E(mk6R)l?5Vd1c8EFGu?*PQFPX%4E>}NZ=o;n@Ncp~qWi#d?K$l7* zt+1xR+*%-2q;r}|cX^*v)1ji_%2g_(Scgz?q_pt!eL}C{4Sp_op>$0*Rr`#Z1fMun zf+@SXq1*jJGd2A|SmB)wrzQ*x;2*kr00JEJ zB%Cjn{P6XCAjc4IvMhxbV$|Pm(xXfwuy4c9X{-e9+>xorc+E!qJ+ZJT={9|??DDcY zxBsv^@AejVn_(Xby{lE#{zoaPspgB7=NYtF(-EQNy04>R6{K0D8GiBXYet^0_=Hou zT_T@O*E%mj@^G6I6I+F@eZ@rFBZ;2eip8hh!+IN!4sLVMDOIgtTOZy^%JOI!G`Li! 
zvF>~?m)8pNGHi_vN(un&=((Ci=f*vVj6)6rxd=&r@`!9++}R&hQjjcTSQ2 z;JY@T;tJ(yRzuj*ZPeq?ETQ7LgGww#ACBFX?SYV0b;0fVz{F$370v(9x-v`%SyI=R zRtxqfK=Zysn_g|gAriM)G5ZpQW?+@Bg*E1hYB^S#%tLrvNF3Gc^_a0rjCh|){+LIj zSGQL8ef)dWpTP*7tdIr&Y`eM&1h)QB4y!gxh@NmTu`+3lij%qV;}@X{#^n$1F?YI7 z$KlMMnMi#=tcY=AfoOp-K{_e!R9@^5H$eVVC))7Z1R1FxtfM3^BIe5RqPL%{%N(uP zP`4D|zpbc-+PzDYYjSiGsO|MqKm{Q{X-T^Ji}OSLxKX{kGqNcY+Q858x*nXHaab;v z7RGL_LC-x(h;AgcbOwE>;+wT`8VB$!)>VlH_pYxE#_L!yuF})ec7n_|CGE-ojI(NZ zpXN)_9~Xmdj{`E42?NwTxPWoxacB{151AEYWTQ%NDKP07VCNY+95VT`mSH%K%nK+3 z(zg<`q@Mr&g{9GQl>aXr8W)m2$?PswI}*-Bv7t{EUNxM)eQ$?5qXh2fAPSiJ#}T#IBf8OZ1#U)Wjud*`hEuMy?jrTltd zh7D83+{MgqTO&pFmi^PG zvD^IGk0$CQ64$BxA)HR5nE`d4qVoQ~GDv!tsD#eme@cKK^S>XD#f2vQ`&U_(-nb(l z;sLGggE0~+9ydSc1}N!~CH$`xYiOE1a3~$kqB_8l&XjmA#k_1bZjF-ov)4RfwJl;Q z*UW4YSUAuLwE1$eWrKY|+p>Q_ zH%z}HYT3*(5D=>IFhr(q{sKoD_Te~)zMS~qPkDT zg^)~B`0xg8fmcRT5on0{_6F9XIiw5rIg(B)pUz6;zt6H6!f}(0dRjfVCvs!+#K*NN zLV2-E3CqDk{gx<{dX!=D*U=pOX-p@+h=gL#zCZW$$Je-DIZwa%$YsOPiIhfZIZIS< z>lokPwghK{!7CN26vbQX_q`K^d9yH`@EKd*>3fJ0wIG@On1QfY1K6T#fl+!QmRQ!7 z5{VvOr%OjEoJqR!dQ*yYUp$^K!?aBI>-c7+T2j-3vhYJz+5>r>Y>Cx9jQPhv=20~D z=%IH^Z&677Jp*B08I(Y022EsLS6m3ZF(i(OOni@i>(*2-vYi5D6rj>-2~<_xoW;&y zTs_en!}ZKj4BdO7!8$9_#B&nz(&lWhlQfZU;h95pyY$j)Ene^LWt`4ggOm}~Q@_Nd zBuH&L5{))2FSS>>YQmDQy zs8kgeFxn5fOz>%jHm)I%uB@=yYyZ{D@g}| zb1EziJqXK2a5qD+x$QI^Uv;`CrE_Vxj6KRsJT@fT9J4hh40m6d?UF9F_DfK@!$GD1x8i_vwzInlPiCZTu?YFmAk)B?@ zIW+DI(p(`xGI{A3f?7+AAx-RUh@rmN#>5DFL`bcV3Vy9SZU)>Dbl1O_se(d`q{=DC zUsZ#U2A7-a(sNGI>)E(%flE`;RJ`8j+#zMe9{c>cp`ZKG;iPb3xQkDdRs!8Gf>RZF3p5*6lvg9}ny%viaYV}4Jzsa8YJ3Vg zS%vy!Z;nSKvs*?^mZp*b_B^}OkMJfeNb6oWEffJ@+>2I`X}&-ft4z)*PtXT(`6h28 zPFB!a9v2e$96QqwW+T!~FVI`D)q&9Jh|z0Y!VJqKQg%st==iigYRz7Io$bzPRstjoRg)07qlRJEgbW#@c|Me0jmiei1(%6qpRh zAY8ynReIhHuhtS3pjCpk@UZUUgd0J5BA?0CyX-t@n;z^|JWBjCrUdG`^!x|!G?+Ib z7JPJkX2sdY_l&Lut|Ud>KD|6HN?wEEHT^KwW?Z;M#4^=n0s+C1BSk%>z2)8&=j9b> z%?Taa_ti(6jV#3J{ZEtj3VsImik-SVFe35g9j&o^6Puw7;ivo{JGFmKI%Z?m(YJTF 
z!4ORd=Cd~DF#2}`MRQejg8pX+fyp7+T;lFrkfBEwwQI3jrHzvwlfj8Z2*`*{1XG+eH_P+ti%^DLnx@;**e4ateQDkg4{r>I~lNEUNwm@VwX z(W?yd_EqvZ0-qxy=)|Cw@rbZM*8M{1_9l{mWIdA${*pS1yR1KA`qSlt7l}2ER6dFl z{A0XNyoTm0DGF*^m<>tA>Fm?4w@&UP(z`i7&LIJ4iGNs5-r!-MX^5jOGTTF?1xbu6 zw84mbMVQ}2ahIwLJ@ozrHwIWA+2&=GwQf{uDB0&RBg8W};M| zQW}xlic#5$rU&SXVZeht(b{~zJw}1`Q|t}11$b7Pj+W(Lv-xENV~|Q>&n&XU3VZP> z&8Z^_$V;x2W*M0vepmQWSpzHQ1t1~w(j>4PJor~c$0tswV_GTkK6wOF zdLs`x-ny(Q^}Cz!nZ^>EG;p|UVk54y#L?^5=Nlt8en~|Cj&&7Uj|@hlWFK5@)D~!I zqnGtR2r{sCkT*vyObnkuD)s$Nr5{j*UuT&|qBw;JWJU5?XEfGeqH1}UrD5-Cc9{4S zzw3Qi<8a#gO&8!5IB&XSRgj=gL>`Plwa|~7l<<2a!#chgWZ?z3B8fk3naM!iX`2!E z#-$v?IC)n?2VSqLgC0r>?MwJS)Y(LK5}5=~WZ9w&D{LUWhc2IoQ4{2^iU<~+dC9P7 zggkCR_zwo|htWa;Z&I`0Tli#LUoDdOnSwS(~DbdvnerUWRz}u*|2cXqF@mtpS zh%yA<%$A&TuZa%?CAnL%2YEW(EpJSpEaq9|%C8)=tVDb;v`5EBTj;GAeY=XY0ODOJ z+Xw41U0GcyCym~TlB3|mLoa=R87q%9(VM5N1tzwX-Y5fuWU4thmP;!%zUfhhO_&(l zxICyyMrW=Hv>lSoTeqKy;JYbNwOZ02ob*9qHZL;FNelRd3>X6Z;+^N*Gke~gUxsgf z#3cnfc#C6+_0c%`XPhV6D-O}4r+~#drL&&H$Lh4P@*n2a-GJwsr8BQzs(aMmF&dxc zl+BxO6I`jJ?5V#n1*QYdo4i&1<+j#+4M>nezk0uWT8Y5l>*#n2&~f<8o|>=-R0=c3 zT|hu6?+6B6>LipzQ5jc@i4Gfph)X0dDZQpx#<6bzCE2nh5UJv_`}mq#{LhEv48)#!McurF>6>+n9&pC@t8l^WvR}hLZdJpMZrc@>!?RwACbGTMzE4}t^l<5(WybdN`M}%-{k*Q%=F>n%pR!L0EmKdN3;~m#<952od z&&he$oBx*9R7@XQpflIf?nFm~G^p*=-s7+J&UbRam4kMF_ROpH|DzNQg6Cbl$trKw zBUA$GJg!O#Aqj+lW+YmD8PpCo!BYJbQ003EXNS0nLcPA2VSBpIwycLmt<#t#x~9HS zl!Kext9^$XY(?i2I7TN|3PYGx4ps@2cF2xW*Ix_XbPfNc@UypA4nndx&t1SG(&%ED zIVf#6^$^6?OCq)u1GIZ;XukyZ|65Hme8;YfG4lntzA6u`9lP0|n!_4y6sFc6b@b}i^oYXORqy~SKBHIbTx{qc;8=Xj!)I2;q z2zUhz9U$r7EpkilZAhv@3?V3ms0f36*y=LL=5bdWYKU+j^HlTy(TzE81-)TFF3mMT zq!Yt7=QBAlc&nWwL{4#QiE`4x7a7@=6{HdIBQgdJoNzSJn?f@;w>aH2Mu?3bVA~Ee zYZD;7)2|bUBBxh>kyzul)6t%Yc^6E-Js(DwUPX&054 zcl^t|+c)a_h#t+503&)?{VXZPNSM3*v)Udr&suL+?`!1a((_2I%y;R7)>zS^6hck zx%hD(fv?IOFmj~Syf!>w76HBoR7@%H@YF@y=6K?BwGVUrkt)8!?)%>JOX04bnfdb8 zgWx`7?=MtU3x7l+vB6$CxZ_Psu+1Kf=lWuAae*FPH-{r(!Z?~6W5eA^VeqI`lQSeJ z=1pQJJDbm4Wu$SCSD;P(@XBV}iAZT*e7ClGE>xDL;+OhCV8JRLqsZKGW+)@u*A53_ 
z#0!c}ObA}D%QUzvNiHiN*m9kQQ)tD>m9Cbn`R&PE@q~?-KDyeu%O(Tw5AAWpv-fi=7p`DI}p9FE;P7YyJ(f>ZjF?OsjA zH^@MO$44*TXmTyDW5er5h1yJF*doe|Y?w9b<3Rm3JLtXNb`!%`nBzwqTUuKkpeSWy z1nzCwq9rcaN%NBWBiTDN-J;Y$(e?#1#7|y*a%K`lb?rgw*u;Bp4Pt8F$K+T%-)j_r z-K!AOU!f<7X+(sb^2_S=>gaH5dy~|t*nkypLg>=ck}GJrs&WSEmE>)Q%J?*UO&v#K zO3?TrG7bKE#GOL1Q#W0))aCEzi?G4D3$9fK&rc{osS-N{{MaM@C?O108H?v3Mu4Q9 z<;`89^E^{rG=RU*dP-IQGrmps_%-O+zJRsMlQ7##*ny z0tVs7xJlq{zBRr)p?=d)BG+Zcz^$Qw&Ks)iF~@Wkh?Bzdfk;|((@v8>+aZm|`JA&n zR2b_JiHibfES13v<}Seh(n>Dh-iI^sk9Y>c;$UtMi5lSbXDbAjlA->@jC^ah^C-wW z2k78Ln-FTE5r2F2SC0~z2u^s6!ZhjVB(aSPUGz*dVR|^B@kttK<$AWRG2FGK*#*3! zRDGcff^nYF2b_dc;7xXx2yA5I9mVjSZL6PK3^1CC(2DTo6hRZ#k z!Cz%Jyt5bd^t8r6QEPSg?yYWZ=4(cl>tYG)DTf~5p^6h|h+n7=|2Ns^z(U%DQZ>9# z)btA=v>V(*5BAztJ4~5OMy`ZrPyBexU`m~@EJC@KTWYXEj=&zRjrp}3*vs`slPnZ@ zYANC^zrBdIksBp+n`bNnAxt9y(uxoJiP|bhGLT`==VPY5p(BhgRfeD|dN_a)oIY!> z_%vAju$qtWi-hW)F~r6V3t(@wX4&@#VQm>!?EOlyayy>$5V+g<89j!4AL{`~Xru!g zY;7Q|#=%;&YZrvit`2~Zkc8+{{G}W1z0`ElYYXjR8HAS~=fwf0u zTvJ>ffCw47Q;vj^ztZrU1lwFdNv-UIe6ye^(64uAcw1?;K*Jjt*r?&EJrV6e_f}Gz zbL%9d>we`T7S2s@DnyR(v)E`gfUba)u$9en1*Yxn7_-NYS8t|x`PD_byEO9`g8B%wG-_vNeF&7y;S+4 zr#G^z7a8)}@V4Uqj!5m7rIBv_Q$532qrWT}xA&*ufQH0YGgtCs`2cHYTepM{nu*^x zWmzjwa6HB1H05=sJ7}u8^lv!-;S{;YsXPL&8V+U$0sWVa>dxD4%)nRb?woX= zBnZ~a_hLvj6h=0PU4>Z(J&>smYU9@0TeZZ1nn&G>HJ46UvsHT4*=*wDhU9c z(p|6+3Ti~$6?oqK1g?4E8k6_!l4dq5{eWX)isPn$+fuiPEfqIHD<0#u_Cwe@-S;ZB z^)x;k<6W#A^aN3N+%+=Ul-^*<@1JNkUSd^mLY1?^6c$pySQtfC3Wp$*KcAwDP}POf z!h39<2ZixY0QJ|sSbF2IMd5YGjWDlHfu7Ec;@fx7Gc%QL+aLGn2B$}>BU~lp^|TDi z5j2m-8FjIM%725(&mbHkq+Bv{%B(IDTA-MvdypdX#Vw6`SVJ8%)LhHxbW|;j7B>>P zQFEW|D>*s8U~l|HTv+2L69Aj2gUc(y;4n*Z)68zAicTkl;{noU$DZ4N??v94nNu0<#D!Z&m&5j6(U$pwmo#)^h(LHYt6w8)r#9bX!3l zj1#@VmxUub=R`UDR^e=?OgJ%oxMCg%1$K$S`O|4~`;*qVH#o(=K*IXW;v-Qz)f)(( z56vunKS1g+EyNLUzb3wTinw*q!HmIypb2B#uj;LTc!Pb4UohI6U#~aul_Up7c38)Q z`AS6PBau|2e!5`{U(?}PRVs3rUq%3}3+3SlA-;g}10DlP3m0V>WeoIVAU!qbHRhmg~P+CD*qQaD$D-@M`dDRWBDHh>VI%lP9|2? 
z|0(}pI4U#4{|6k^(?lg}tCj6msro-CsvD)&_716lJ$XC28x)Egg<7*fKw-$CUSwC7 zvQDq_otNAj=iBd(7?_7gDWBDB$Ep;@yh6EUeYiG9MnG{6&HxQg^>rYCD5(oMcs?;W zcNEw7-fUtDB>`HLxQv8t7}U`BiLW;z{ubu z_oAI0q?$j^{Pyg`(#$LvfO9jzcoS6vP;w3rp%6AePZDuIj4^C;8wiIc5Q|_Z04*$~ z3#@;1XJl$67%(+SeP~xxQ_|I+eZunma-tzvH>&z-GLV0B1|X9bR%YKGt6(g`ui7+# ziEQ|<8(#Q3$}kN!7HjmjWAnwkN_pf5spH6E>Qc+e6_WI3TIDvmj4DUI+SKA(Mlf$=kDM>92Ic-V7L?Hww zk2o4Bxr7Uj=78RQU$x&DvR@Si#J<@Xs6B)|@H(L|)rmRb#qmCX;Z5jcm$#h_feR3K zhiB8z-9W38^P}_g4?jvOTP#{?pOxLg?R4ph`OzUjmo>A?W5 z8r!MCc)4&Z^rZg62@Y)z0FYl4q0Dc|XC7hxpH&r;z-vWFQc(|kKag?6VCgkTXf(*r zUtZ3uen;`n%}q)G9Kq+G>e3$@2-o_$*I(I%U>=Z&Qi7tD_6R=3mgi`pQ5lqFeT`vF|E3OUYN1M!_m-TOBk|T>-W4%N3e^J?cAphE00Xr1eahm@M z0B=r$AREBPPb&=o>*`oJgK+5n4cu$^dS^$`j~xX)JbB-$!NT%y><8h(@?XJh0Jax?2qfjB zJqZ9L(06db^o$>D2YcbbOhGW<5ACnVf6Fnvb;Ltgp7=fdhFNI9CYF zC_e!>0PJ&o1fkhS_~YQ=w&std{kGMKofrI9gip0U|9J1!{}!_KZ(t!cvp0B%0;@lG zh>4SXa7O|6Urh-3;T1$CgTHyi^$DR%5Szj8vwUydgMj-VP{LBG^VU`_1xr3-vGgxcFbe}q81|28P)@mn;6aOX#I z6de5a3;lamtNP>P^y5O(^tUnk=T!0+!rEUe=SqN!4Lv@V;g?PUI8e)`2X-C*lnhK1 z*8TgEwctmR|KWuCJ)|u(I5CwLk`;y9KS&9%@v|LYBFHs$_J`kl>bFT6O!)(Q!u07WnwkAy$FWV+&+R>Fd=vLHB7 zPoJk=u0Huge0B9zXADqt04y2MI@`rD+kjpjr6M&HStkv8pNsG3$a2uR`Oj;nhscsV zH_JQ>tZ(~(Y#69(F!qPy6GON!h15NYSk&*Vu`$0T&K%)n zHC75p6iY%@$&eH{Uz$zC<0fOHDEJSM%@GIq)2Bf@%*(!DJD5b*b{x=B2cht3%LoJ9Lw@j^?9e?9awN z2L1C}RXXY)G$vTYt+Nzu5xIwXeDe}gy1o*Y>U~Im9g+ayRwV=4I%f)Hp_%^LKRYAuq>?{v-DR-R2eCx;)yPPcQa&j&`0#nhXm zbnv{EBD-5f`IT;wQCAf2Ef0ZA&(qxP<>RUR z@O!Y>{Zucsm>X^!iN@VxHC5a5Ofu4}y zU&|7kh*$7`q3g$Fz19$)j?)P=dg9CObzivT4LVC%I0@Hxyb5Ark)k1YMjA%{nOtit z?{xDIVgQRSU;!veQB3(k0c%_LrKXvYlv?mjnP)zmXlHNDk(tpeF{1L!J&W(GWiO!h zIO~(cy+y8btJ%b@lG#{!npj7C1F#z^NRkH5h>9ht=B@;@zF(J z>T|D5s?i*R2~WFuYnj#&-eTG_%NFNjWOprwrh(?Lj*BfL;tml(nlZm<7On ziqa=zVOSc+yPT~N-;_q zjv%%ltl1%?@3r>cS}+=N`=9Q4px&FM<>?PLBZwoc5o3pU3)qy0)o5a$1nq=`>tR2= z^zWJ0(RJ(be5&)oQ?Kf+%A7Zg!e)jJ1^G3uZ9YoLLdP4#J5UI+g0bDm=kDrOmuGb} z@qC^b16BYVe}v>efW(QSflsP6UqN+E%|YrHl#@k=mAwV|HM9440jd?@(IV7B47f 
zXXlIt>UL%+*3K2*D&a>LnRkbUeB|BU(+Px|&FFSwHBk4e^NrSgs)yIa*Vg*(@m{pg z6T!EY@p*JoU?1OpfsC;o>&ugjmBp=4 zp0KYw2ikjXQuPK!Wb9N{ES=$&K3CD2ui5O3)q<)C8kbUHZFPlym*YPsp?@%b5ZeKV zk_~RgVGZ_JA+Axi))h*2S?QQl8@SmNH~$*Qy%4E&3%4!{QIHACCuhwTU#yRlG7j^9 zIz-=&pI$WgLasv}xN{~R0qcOowlyYRy8$y5=)i495TLcXh70bfmtktz6%Mr&7bf}p zvTF5%!)qQ@+v$PePcZm0MC+GPWCOgpR2;P+;P7$Ah^&^T0=Tnv%qP|EIA`f2IWK6$-0reH!*}gS%g5JJakm_ zRz0X(_GkEPXENv7O(>y~F`LM=tQIGcT%11OFkL-et^Z-1^2cs#N^@L)=S!gTERY1N;UC^ zKgnjf6Nn;#F}mW`NRWrNyd)P>nB1<@zB%L+ItPZAJtyp3hlW00tj=P1ntf={AKw~Q z_eLBXJEFnXBlZxl-ptoS0WaN~t3I1lrPC9+umbLHUEB3r>dXbj!D%ZzU;lR;=uNbG4ykHf_wDc8-Wu=@_WaW+DtJ%u^`iTwoc<4=`NbRoIY>G>s;N=4)3lEz+e?P+na}#L z(E~6DidJhE@N8YZf!-Qh=v@&%b*Iz-eG~xdL%*<=eSXV;6LSe4ahBY&^Zjm4j>m}K z*E}avX~?slql65Hq^wI`ilGWennMC~=QX*S#d~9u%M1WW+gkmaY!P;_xz%xZnt68W z_1Z};>?ZI!2Nx}}`qRL-8^TcYY42WJ(RA%LXDu{Qhd;s_Q%gtm6|v$izPI3Zx_VBK za7D8_L-o~gq>YeX28Hg!G>91*=4q0~&3h3}^}=F(!iP|+h}exK>Y1{x^ira=5&|k2 zjZK29Zm=*-M6-DzaG13zGL->6*5xmGgXLs<$XfqqabCYl5-2WIg707fu3J-+o*@^m z>Ier0J;vi0?U1le+kjc=CR!;$I~US+odc~t3IN_G1!zMS3n>xLmdu&6l!SSAzmtpl zCe3!^wf8jfbyeK;aJU|K{0=ML&2T$xGNp)J{2>4S=>z>xUzk>vE*(InHadopmnGn^ zh8R?G`vg8%tM<4V3?y%zbQ<@YJ^tDXI4M!wZ-T$z!M89i5E3$G5%d~MOhpTllzGPc zm9Dg!_w_GT$$fI~QjxQ;nkUECTLcbMuSv8(W7@z1o=hn9@kZ#as70^sc+koq4@bZZ zu{|d4y@K?$AfC)q1Y=4W8GxdGD)Nz?nNuNOa7hjtj$5uHjfJwai3|7+68vX zN2%88+*(4wTEuh0+XYrZM)VHb9~q$=g9slTchnrAH1XBDVHNYyqI_-_iFg)VlS?u* z;#vt3Iky811Dqm*+G*Juv#)WC_~367LAsZAa<4QGz>9dEu2XO7@G2kcmMN^T7ayve*kz6szBRVt-E+%fDjYBxyS45ycLPU6Xujc|SdlAl zU3Dt~7ebaLLm_t<2X&)-os+M~d2O1;v@{TA5H~TYo}*8GRl?kEFgj)Aqz9-XLf@7c zoatjpK^SGnZh`no3A^XUK>+seB2dSGGRKjBXS_tWJ*_y)@bT_!;WLXko}p2}Y&%l- z*;X@ZRpzTR%6>wia8DOGRO0zxMwGu~_BQi+Q?WU~92u2h`4mw~A}aNHjE0o*9*VUs zOOYq&8)}=GpC#7@v99=euY3V?%@t0&_cN3L;b=Z4KaAG`of9E-%hVH@QwhCgyCtek zOTL_H$gdG_1cL~y!lcKn1oXGauIyq!(#*AOzne$pM$4Hb zW_d4z5TYy4zIO}LZwkSc{PDX|L6&h$Ng>9IOhAroKU+sKz-vPvZExYD$JBZc2Bk^g z;LeP0sk+IM^Zr_Ol}xrmEM*E3HHpYufmp*TyE9I3V@Q04Oah>+RzIT(BV#Y+1_5jx 
z+4lbsYJS@sPb40Pkv7En1iZn^kh;QK=^HFCg=9aaT``-f;K45n@d2lJ+Jdb|_OC0T zeYiV+4-pj~&4NWXDGc-uEB@3lTG8lk_zorzTs-(=&HFX$M53Y<(zlD!oVwDPKLVM} zCdS9$sdD`dK7hi?$v!h;LipC}or@=?L-6L58*|x53MHjaGU_qjNujAR!A|C1N#*N% zNrhn_TkuL7Si5d7a<}tjP z4^Hg@(X*B1uWBLAj_-Cnps*yF>FZK+a4ctTUt3V?p-Ip#p5=j0tO6Ta+dR(88P47s!00mXOt{ zUa>Z0RZRJSQ>fA#A2klGwihjs&#x|@XRQ-w|GQcu0F!6gLJ7f}Bllc=CSTpIP!5iT zCaLCWRoFsVY-!a1DV0%EGfa}dg$30|lmFt08jc%qgtCU(NHl6#$`|5Grj&QgMi%bZ z>(Lco>BTK|9W<#jYcIG*vzqEH8=kjD3+w!}Sa{Gb)f`&Qn(MhT+h~fKy+_32tdd`5 z(@vwTy8-ZGh78L*Q2RDhRuvx?HTwQ#s12|lUfQFoJ3+qeNH?`6E*zZW^ghV_;z+#o zlt4?0|Nau>2uU?a*7?Q;93}}RuEl!3kv_T^G;i2$n=i-*yQmnDZE-@xN7`>kUr6(E zI+Jrcsv0>Wu<{_Yf6V}Lt#H-@hNFj1`_JBv?pt4;!B?+$Ao$`|vt$fYdbzE)1+X3V zslE|j!SW~~BypBB2w0FhruXfLE`b9|>yHZ#uF9fb8n^r5P;}y)jKaMYJDiAphv=h! zjn;}%ZAo_b9t`*Egk>9`GsZJ4TyI%Po;cxrm$buMY%Ge9az++gc^f5@o_q;TvQy4chA2DWLnPBNPU7Fcg1L9s~-D zoSX=@5`X92r3GT;hIdO|)LaV~5^BdG+ImUsQ48*G+?k@1)0w@nKWB57XJ7-cU52w8 z$543dXlsP5t*Z4zk{lh7dose3Lgz#L8*tA3F8$1m_fTSf&Kib@ZNuB?{4YM9z3}GW zMsAgbBniQ@W!M*Nh){z?&2_$rMEP2xr2!&(2l4~64zC+{-@0|&RqzCMj~m5YoIjKK z9z6DQVD$FWpl@{ygDhg@eFgyIlWCD#FJ`$7?!}~}#19qoH6{Bdb!Cf%Y*w4_D{^yRe07 zVRe?f*;UnVcEW8#l!SkItTPp-xk|`_}^vnb3LXBb~U!J&aBaEEdpMhx&+y-TLFCk+#Gcr-l-^yUi#$V2c z!Pe67jk;Z-m-N?q5fBCY$5s2Xu|sFF(^VGm5%M-!fR%HFB!<~nw1-+^+3IFnWL-N7RQt!LeQmt2`S931!M1K8~Og*`yPnVR;LsUpbmHt{Ck~@PKp^7#5-LgU$X-iAN27OhBv4mKzQdWM$8)%|@dj8u-iVs{wBL zi5vr6X1=*u7|_z}%|mB^pzlrbi)L%y?O2=qnPBRmH3t-I47q6iMo}R{l+x9=eVFc@ z_%IRY53`zDpHn6xQdK1ZU*~3hh{$bfzp%qw>f0PvIXs{)?>!xE zQ03`Aa}BjOwzFeIT_;Kk3hwD{dCn$oyCz`*hgVX0B7A5|H zLVCVNpE|q+Mq?Nyghvl{bSIK{RWXPV1-$IGcdZ?Kc5>b5Ar2iz$B~6kAPqBWoZ*`#&8xBoc-)X0{F1GNE+;`@{dw_UF1WUZr~1;nMa7{`EvdU6TmmC8P7i%}(HoCR(2%y+2(-td;vv?O>JujlX);waBF?^?VU;OweyeAg31YBmq=_jKtV zug^bKST&BTg~3;RL10Dpu2GCJHKz3(1~79R;XsX@97Bi0xIOa0s4wV`MN5|9>Y2H? 
z)6yxd5t2SX8__-#R%CF$`EvyUQL^MT>fg=Od%-m3L z=tR!VR4Jgk^~6xLug}*FiQF3rm&-U5u84YFI68?>*rR?HX8C&-u4=H(w9=Xtetn(E zyaCr=-1JU4WEx(cXtM#iHCe=PO|qnzKW)8mr?FYJb;+cJF??m*QL=@-@c}=+F<| zfnK=mWBnWwDY~Q~s}{==Tg}dtncaPi#3%^f4FZ;VU$=jQji)%vT_uNooW6r?=qc2X z!q=Lxg-Zcai$AC>pM74Kw@8o^WtrhCN@s7N#)S5w2BVx(C*qJw%XDnN-a&H`Ee?tf zxN3;$9blsqo>VZpWFnSW_bsK}LzvkaWfB5&Ys|{LRmy7iqW+%gcx*t;Kt9&b$P8vu zyOj(IBF8PNds3Zm9X%evWl8JW!PcF+RI9btkEMf{ho}|%r#6B$YoxbxrMCZ+6Nzy3 z4;p!-lEGYVi&UZQSSHW!V9d;Ae-FPkhPzrEu0Bqj0gh9Rj+FOJ5_;vA;O)3p)Btzw z-1(pFYqIoUy&egh88+@NGrqPvbY|nz2OqKj!y++Je(6h+X7QKWE!4vi2AIlpmMBIOO zFnwi1Qn9dM^0k-VsOpFlX!F5Ji%ZveMm_9+E{6nu8l4}ks~_y+6nHrEk1nVV7tHBs zEUaeKU}(>j>H41NPPvV#bTiW3mbCd;cZ+*1N%5c3ACZ&RDCAN@FruP_{k2BKFA0&q@g`{DFR{ z*o!dRqNcJuP33!v=_5C{UZS8)L6H+~IRDr!i0gT^HUm+g7ep!{-ZY|4-JHS7gVW-R z*e6)4ko9k9zf`^8;*P7KW5l|LUm2(!f=NixCxLrlNPVBB)4$nIgt)whz9SkuOb zepEvGLb|u@ahpQ2&glrx@p>oC;D5)yW*`o)^Ea!VvL^5E>Y$^ir3g!|**)Z5jk94y zIX^Ud7 zOKQUw>fpqv2Ni^^U}JMJjHACpB+twnJ)L)aQRS$H;L-Wv&h?umnd{Fi>1QpbXn@Hi z4O}SJkrOEOkGU;*CA_zz>zrAumyBN($eOr1X*=ZF`)G-ZBvbX4VYZ;fvA%i=IqM;h zSXM%ET=Z&=bCkKaoZ5Yy8bS(MKB%d@6UWBL&lkcyoagqx*D$rO(UDb_@# z|AG(kNOnTv!_Lt!t5t6KG75R}#SW+pU+j`ge8_($G*NRyWI%aO7E0=X6VvRvk_rz` zyg3u@73KtKyQoqnKQp52^=de(TP?l?1$#F>0i|kv220r_{?G zI~{`oCM2)hrC*P@;nuv)n=IxcC)8@xF@&yn=%++YCFz)+^)`-#L|UC3*-&AeWSbn7yYgbM0QAP|T@F@v&g<8I zz#Q0O^GTX4q$OMe-#(sDwPxBv+`p4}@#CQfwwwIMIc=x@e6LV3X8u`&tetxdDkr)X z4PAtpsN_<3X*2b{t+qH{lK* z#)gqb26+i${{RPDB;+(bC;+`0=oEW`b&Y{if@X=bdM$nS0|}Y!ToH#{CUi=hPI8wA zm`n13ZJMN7F+>+9=%AmT(RKp}A%FJ#qAWY=`jo+uL+1``@Kz73uuXqj_1Iko z@v&xQjP>;(aACTN{E6hHGAFzBYU}snrKtLiuN2 zM`7v1b7-rTewy-H9F}(C&UP$Teb-KxsO%~~PSd<$D2kkgU3*`z9|r@B!a{&x7j)kL zF_R7<9;Ag1;;U;@$!qkLN~I}mk?E!S^2Dr|**6)V5{Q^Wxbz5qyE)aG)M(IsI4+ws zfs;m&-UPx96~u*{Xm!1}hx|VPI6%k0CV@^~`4+k#g@?t=J*GQ(roAKoVg~jv;rK7t z<^-`5u&&crsEaCnjeBTpodY+x?*71>}Cy;|UW*?>FwzbR3)k z2WKpN=hm7t+^0z4g{YS0sq}LE%&?vbR!>Jgiwd1cX+sK$*}+R+O1EHNzxaR~)6&a6 z1;imv9adz7%sr>CeiRrk6|L-@5SLhC2#_d9Qd9Uk^W=%e=s`M2OzphhBP2Qv14i|9 
z`)H5kG=DeNRhd3Pw*BdFz#U(>XLRfPtca($+Yh|(s6s=`UuSb-uxuZu@@V`_o}bP) zQ4yW^nodX)YlZUyJsOnvX$C5>-3bxiDwzkQ-KHB@Q?-3iMJaG0gL;J($GdKz3d>ol zf=e3Vb#VC}F8uQWO~s^5)ZQ=ry$)*@V8G_9((Dv9+E7oMnbngX|8qIweZ6z>1Wo66 zlkPB7SbBsD!ZklQJCi|lJ=Zj(>%=i8!K$3qtLN5dj)%{@B|Ez94q2|U_S4g3EA_ak z@K$i>Q);?M)KchX1pu!$ctKD#`B{-nyFE{}Ar|7So1CGHWLpXJf=g2wR2)ZUP;RyF zSf)>7-L5Zj?CdXKI4PYm#rS#vONmSAeI$jDo+cxnmpyYSmC^I;;Z`N_8+L;_Wdj5os9FLQdOFQ zT$!%R(x<_OcW-O>M)W0X?yHuY;Xh;ZZZTP$uO#@slO7Pejz(HNv~DQjb6NeM3N0zR zhMugRaFjU;#h;ws7r1UxYoM{-(%$dU*#U0tUrw4AUpB*A5YqK#QOO)lI@b5D2Ksu= zh1WY0XuD}*PG%4UbGp&BpiOItrBUi;Op9Z*2aP^CVj=!brgdkVl#<<(mQagWYcb-x(MqX zLW8*RQ}Vr+!2*Z2l8YMLx6Bz)mU7kiui|(IkW5`Kps9Z45*x>HA4;DoN4J?M@p=d} zP}p_}&#LtE;*!HI@XW-$-xmo22%1%Wya>5x8N$I;6@Gc#9yt@wr}u0!Rju`u!ZclU zAA1qDVHynvw|?L|uD$ihoWl5^JsaYmkZEfAFx<=#KfGvcrUx5^aHh=EsJyg2vk8*I zG@PbyAI8NHwSR+ZlIPt$4ett+~U(G$|LKU!Hp&f zL+R%JAlh*6_?hqaIHiX2!l>#)b}%l_7DDq`*rSiGjuX`=;n=OnjN_-War}fe>jI9N zek(Vo9?lv~BxewkAc%TL!@ppAdaF5g+Q2F#aJ_c^_$UqQYX1}0(#|hHfzy#)uO!G< zchU`~p0P^(rAn~1QtL?uiKPST=u2`k=UIQr;eNI-y-QWs+PzM7+#P~iKkzDBoTeZg|?OS*$gB{dl1+b<)sj0-)_ zbBbY~1}9x$pCt}q-#lz=M&uuyP6%T2+3P{q5=CJpM)qGQ8~(KQnlJ0fd)vcw-qV0rcJ zWF0iefvv2cWb@t}@>2{jea3FimXyiZ`2;(NHM1h!2@h8bvaOp&>VaYx>nWki9$#oe z>{Bj+8-%-<$Q|ly3OW_A0u-*(q9TF_F>&;u)2ZeqV2__amqleEhNp9=Af@A^ao@6; zx5F6Gcu?1$=!dH~f{;~a`=L3!?Fs{@Ff~!g@O_=0FesOOu+qG!gnFWblq#6i8YDw< zx~`Bp>JuYj*_ce`Iv-VxIB;+r=+{+$3&}%++okI zMJX~GCHD|pTi83;fg#{g>1>FSldNDC+;Qfj?5V4UbIzjQj1Q7EsB}q4y9hEtsgn%I z;`@Oc+$yn}$KAV!3QQLvSgRW%g`NR# zg_ZYcsc>jSR^rmr5?Z@#o|qZ~T`f>|nmU?kXcj{5AigB?yPuvjt(8C|zk85KF_BmH)(1#PL;a?K53Op|>Dvyo&#~qOpg82M;b}0NrPjvX0ywB|a`j;}r_Zfq` zE7f63K2}0?a~ln>e)X2}*N?lg5;4EnHS;^tYHWDR~F~ow!rkTn8E%qug1r&F`hLb6htbVe~j(qizsrSxGiQZ`fX7yt$hX|^d{Av!p0r4+Y_ZFdYSNW{X|U+lj$TjHD;8MjBLcQrlLUS zgzQSYtfLD;OpadbL8R^?GE<4g(Pk{+>R=cT)f9b`>Cy1G`~APQ96;O zclnG#*I{GLCu!O7Uuc#pyj*ltJg8vgPcFEeEnY&oxr=?qmuin>dmfI$E!Z$%jA5#1 zp8fI#C1iY|*LO_fP1R^qPx7nOAMO6sMK639r)|?>0)Y701wnfPJl>n5(Ak-Vfn;mV 
z{QbrV5j4s2Aecl0k_-AHU%Cc{q%?_pm0cl@c|-#xzNV0rxryq60pgyS6-KFVYOXKL zwej3U{I;qGhqPoi_CIlCjqZf2#-<1w?1fl6+iIru12L zzBCt`6ZfYpdjkc2-z%YNA~Zc+N-Pik@*3ACEbCn{eMsh}+(#Kh!yKvTUfq_9LwLBt zTPP-CoQ>jC_q;3Wl2_YBAhjM=4af&T6-7-iihaoo&iBY#>J)JBT?sG7m|k3oG1l<4E=jJMY9Vpk zX*t$q3MN0&Qj;3AQlFNzN$2%(DmS-m{QPUMKO?O)kZ;wkf;xdH_g6y+Plq`}<;a#R}?vX(_39Q&%G4!2F(PvC=0KNI#%KrgOFma05Q z%^Uw!y@O7Sk|Sq@U)tT^t^qvNH+7Djb0~qNLXvNO)Iv-@D zm(@{nw-5{GfHpuYwiO)nx=7@Vcc7%0` zP_Wj^-kqI1LwwDSbAB=-ahjzrG}O-HlTD@YIk}q9DH_iceo**I+&rr@n8(rGD~55< za@_WMPb?Y@132qiF??GcZVwN)RLz=SWB$`9=(4}Tg+=1|8wVdFi+k}Wf~u=y+DP@h zXq}36M}XBFZ_?d*JVzu9wE*<_8oyJ;RrgESZ4BC=YV^H=J&gKzyLL@V8|}w|+@D{& zdh?f0)z2w19H~S0zhAJuJJRQuiN%y06BsN%Oi1{s3V5!4D#ke@t!4iy*0-5$jl<~I zSvevMTY2AY`9)T|tv%n*!2T<#bqkSqiIo1(95>qH+|}{rz)y-_++US{B9-rGJ>$DL zTpcS3Y0{T#z2ok+F?}m;L$Xw93ZQ?uc@(pD#-cyAb9NoAT0ipbzr3`Te=??|6z&Wy zeu8}M5p(Z?w%Q7(jz7~6AWT^s8^gV^)l(AX6d-|vN&ns=+yRSY$~Q?ZOM1IKN!fI!d_3_+>m;%_$o#HqQ2{S&ANO*SDqd7K@lp-3MkNG0CX-0Gq*YsZEfYm?uWHD znqn8tR4A!fKpkzRrvNfo)T2k>!%4~kJMkq}R=KH_B!^RnmpcGq1A^uc{+L;=wY3Dk zGV0ZpmJhsE&u=5KB7i$-@PW{L=4M%Y_-7h-eZ}V?6py$SI4(x3PR`sD2P))E+#WD0 z5!SKJmisXu-HJ{=rUvTTkjT_#A8x8Yr{-Rhl$81#aY)J|e^E)8rx23yah7dybJy`- zK1iq)bw^R}a;8tGY=wn+_W^Bbk|apReXAiYZNvqEAJ+zk%bHH?vnO8+=4U7AkvLmb zg8eOVp3qSg*C)ICd*(U;52oTDaQ0M7XEu~X^!2Hcsl`jVbvJpeRv9c4IJ+9U0*p`f zYMr<*Gi311^#G`!*ZK2J%6i0AMY3%cl1o0{e>U_w;-20i7GrY?4`koKhY300#64H> zw6SK3nYhsozMW`e>}y3`cuzLhF=ByVMs}7jjgu09faoKZ`uW`VE~oMA5PrrL=lu?n zpU&Q%f%oguZegipm=D(-=a@G+bP12bY8^zSbP4PyiQDw=RT;R2+FpW}{#BUgvn zjYReJDbs!xYg(ApgopC4xgX9|i>OYne_`0DVaTRk`e}~Hcph5Z^Mb99C^I0fkzBgc zw189(&6x4P)_?utv+bcN6DN#b)dOVDLLNwSb?AF7ZKhg7TK!-v<;Z;s#a2$g7dMUa zlqTk$F#QP5<|RXsVeDF!Pgeg-Dj{Ng*K@4n+upDVd)w%M1spn8>W2BkyC|zT6bfC9 zdIDQi)k2u&pwxLzOl^DD{hY)Ycb|-~ZbbUvD54QwFso{v?f`>}+g8vzRri6)zDZ%6 z%87#d(81*Eyj@i07rX{lOrQaALKYl_$@|J9DM{=j(lJbf5d{B57gBu7w-Tb98hZ#b z5AwbHpYOi)6(P6_`#D-^#1zJYMLKQ=Sn_FxaF>A z=|S0**OBVxDGVn4_SCV)JtIu<;?I-C(8o0$(V}?~IDk2^y3Xa$HcBM49KYqmLnw6> z#^^$lFtLQFRwL2Jof6`4C;)2){Keao;3&GK_>9 
z`V&rEx$}SkzGI%R;E2Uy_2tv3U{Keml*amQL1jDBxbPOk+w5626cJG4E2>>ybXbbh z5qC3d;DBv5YW^--ih>YD*TGX{pcnfbvL(eH(YcV&r$~h_2)s;daV3frh=}|lF}r(b z3oy1Eh6@vp65g_O)t`vz?7RiXZI6QroA9`y^cph67Pu)ijvsrNC|KsT{+BY+8 zn7X8BJ>hmXe)=`BN&f42DpEt71Mh)j4@O1BdHm_6)Il8rlHw~I~9dn~1 z{Tj3c@eD8TbR{ubiJk&wMu7~UpYkp-EQbA-Puyv&5M$ue45VqMJ2i3*bGWBdMQEFk z52aE3x2#h*izR=6T}yVhoH#Sf>fY5AizlMxJa*l@U@1Z?la@hII=x7f0I*|X;~P#b zTfb99d9)Yo;g52%0&Y2fC>x3tPUrc(_}N!SL|%OzVN`pikY~P78uWUr7t!}XKS9D0 zTE@ccVI1%FTfb;bqLS*J@%CCrg09p;8sYm0#=}gx%aje(rMp_=2i0d@g74y6uhCaPd0~}xXS$y>^E#@3F=^xg|8{x1WB_x+(FZ5^RyM#|CoMpMzaJB)T1%xuPIwCd*oV!AlS9E7wtefh20)_WoBpJa+ps4 zi>14~PQ^3JDQIs+6#K(7|Epa$wa$1UrNR56{7UP?q08>PxgzD)b=j{g{rKMbSbk4@ zePOb-c%l+79I@j194o^05~OC(HLT;B(TT9dMBN9BV1*A`pW@syi_kxr#;dd9Z*EsU zdhHps+G zuAX-F2jT#Je1!LSlKckM);TwY4Bzb{*KWiGZ*|%@7Ts5aZ73q#G;Xf0}{=g{v&Ki%{GYbtoa~0Ftjm%iS zZ0k|NCA9B}B~H_+>>5{L=;7!A!%qO;{!rc2&Rt!jCcCieL0KY!AJFk{No> zAHE*i#yM3znDhlu9>RZTHu~CTwnd5z2|d_nE9Pdu*$(HC`q%~8qSW(yxLbb(Sq@W7 zWm<%*b;0*CPRHkfK&efW&Aih`2?4lXqJRdhMK#c|ox5j^@vP0KyzX?aHXe8%ORmW( zk0FE0_dXgWW4Wh#wQ1@4CkSj$wz#cwrWPeZJ6bn;e%4ul6jk=R%()3>ubg-(HNzQO zGa5}4P;_=dR@r%H@{8f%b6n=61Ki512)ZXfDDv(HCy56h8i(Xm=EKHNellI8bg(U# zCp z_80K;lrJ?qTRFk>FP9=SVGZUbA7Q2~-N@s~72npn{;%p&C`MmeN%V8(Z#tq#>C7rj zf5s~GdZAwL8G6yS>$==a2gd}bOWMtbaby=twBDF{ z**r(S+`>DZfN*GFUOiTv`ZwC>Plz$!tU>7xH@1WSdJ3Y259$K zs17u@D|O$Ldmm${iK7$wnG<}X%Q9bMm_-D7KP znpf&&feFXKo&l5B)DuN;K*D$SlmO0@5-L3jklM4OU%C-q_kzV*7q`PHjyMAOP_b{g z#TaOMYo+>)tUi2to!P-6sn?N~iglB#>&iFMliG3fV`f-w2;FlFBS45Mxa=bn-v4Xs zo0>CG6lG)E&cvM9wr$(CZQHiZiEVzdZQHo(W#6asAG)fmyBei6rpc5$BP$!+f8Agc zWs;&n{ zzoqn?co$o+wPLs5p$0P0U3Y}iO?r4cL`S3jm}eXMvr~?1Zf$bPd+hAx-;Xuf$p2bm z(;PAE&>3C9upA5obV_!W<|W3-7h4z2i#H$BSsY#=?rV4~GpF>AExY3)=3= zBZb3QGp-XBx$KZ8ncXM#7|vhYQSTV68NFN}%ZTZ2BNyrvDjN#|G$jjV0p@2y*>sc@ zlA7&7f*6Ew$}b30;<=sXc}f_d$c8FJe+J5GBD@7^Zj7>CLeP$=_03sU+TY7h@sX2s z2DXBJ{aebWz)<@l|D#^I=V+}v$8F{51h#=`iU_XvNLw%va|YZP6J1^8T%T*g)RtvQ 
z=<5}N%r()vxd3!26KB$34IAvNGHA8-L)^qgFpnC-JeY+*a!zy!TpKDKy5P4<%UBHuz2%x1)53JT@Jyb0~h^kYE8Qy&aKbqPOcOrhpX z$lqeGCtYv^2abN*z@Tglpt#~c2fr)W0tuC;-_L-j)DR-!Q{|92z(C zrOhbS7Tdh@oLmPqeGlGDM;xoFaG0s@v16>qXHn8&++d~>Bnd$V7{nmn&;6AaG$r`; zlb04rA;^=p&QeqX?$eqT$bbkjlEUy+5LGe)7vL`zb?T4c47>o3dgPr8=grjo$%7n< zbc>@M;G*>c)&W<>=06q{Zmb2&!e}wn3F>rjApxVsrTafE~cuZUr}Wa!)CY+MU=% zGBi;-rF?{7wy&brr%r*kUDfv$Y#{SKX2Vyj+&s^_M<^&j8x3P8e5H!-5)T)$Bs`1U_t*D!vg44zA^4_IBNWpg5C z=|@m5^nYe3M$AOVR7qEAEf>dtUPZ^~!^?+sGB6wQ zl)HkX92Dk-8|AjCvG0 zFTl=i#|iE8U8s@fzN;@$rWzwkGJL){Ck1-tdVyU!gZz9uQ@u(Zxn5&PM?wC~b?~8@ zE4~SOO8G^vU13u4}8ikg85&BO?@Baih^JVpE$Rdt3>dw2m#YHP zUDyy4A)3&6nCjtetosA`W=lsCbUcu*Pzj;ujlL`*4X#5|0H?S9t6jGj%l~HT7NIJS zUC1B~EQ3Z)KdW|~HMk`6#Rm<^aoZlL^5Liep!Xnu{*^A3Xry@9%agWKljK_5N*QM= zrVCxNb0*%LMylgQMiK;ozObijX!QQGYTR$%>z6oSr8RerjovyRRnA%K*97S?B!}oO zS1?!Fvvph{!uEbiEWH$G$BCnEehkv0LrK+qJYI$-jAu17KSC~+_8@zi`BQ=onQ_*# zxLi3B(bP|Ek(ojwEGt4*poC%KgmUc*wphZw?}aF+te~YN3@^_HOtHt={JzFYiNlQQ z16j?>TB!gYkdV53=~U*U+d>UI27&Le<>Iu1Pw5FIUJg2|C*fs`^|O}<$1Gm7=%zf& zm%Q|}dn#|foj7wd;DhqZ7@sIvO-8%v>LVx@8(YAs zF9~X??67CRaO{a#8JSTP7 z6N7S9?n(#dKK<*Mhvw5Kg#GnPnaM~e_G2-miu~KO_%wgKu64kE_A0};Hx=p_kpOMA6q76C^+d~ zvu?fl#m+}S7#=~m+Rf!jeF)*ojs|oV=|DKSjm}0u=mCMmhID~s=brKAwQw#w${w(F zLDIuj9R3l26JXTg>SqC8479~%g?H5EY})Ty3{CF@f?Kwc%lD4qmJGZ&u!-UkVHb4% zM2%3TNzI?<7F+DUtDuf!!9K3&nxE_hvFG!UG()P?P*ix#hKYBLteSTe7AJmf@F^f} zwQ?E?k079RqQu)j)odUX?Eyhbax<9CWP@E`A=@=4@>6ffeaQU)PQ6J){24*~STQFE z1o>l1bX)s%_X}F+cfQ98R*z!Q`Ns&=V%sy|I`osj4B9980rO3<@<4GI@meNkk%_DPiEP81s1V0Wp`Tli*Z4)}ka z9$oY+*oduy%a-?uaG8j|D`yT~v(6ya%hxl=?NlWr&y6d?dW5(lz53cHrJvPT^j%LG)oIVVWsCalGs#_$AnHb5y{^W|mej^Xcvk+cR$mE^byc?zR!aX z@lI>BQzvQsS{kKa`Jb-@R~&k=R#XqZFEAMLi2n7WR~w?=`{G8n5-nqov3R3sXt7zH zn(L-YZ|JPDc+IGWxhe7AoajUH`MmcWvaTg!zeO*w2t$u6zIBhT<`vUdx)7|Z3%8!p zE;tfO9C_ZfZJVg6x^inx*4KLOSHw)Y%4=F``#SznLYwijD5BtA7#{|1`2D$w)Yg7puqu^FD5`=DB z-nq)i3Yxp0prv7cu3WRqe6-uyUCZyb!@*FRZN+kHyaJm3HF7|y+>`4Z3AGVrWk1rX zZ(e2_){uV%gpO-T>s*RnX$MF 
zH{)^iFxashv2jcZhr3yuB{?Y$(KS@l+^DX17g=0C2YSX>Sm-v~(O}od+3?M)d!wl+ zG2BUw-CC$Yt7yRhEYOHyH9$g_^d;lcsMSIAu!AXJVCe012~o!S>ta^Zz8@eMKWXA7 z`wZ&@ozqy~atfdttD5v}y7)>*!cmYWE)?Rr=xM<`PL2AMowHIeu1z)s?#q0G2paoqc=v|cwWca_>t~h-i6LOUHiz9HB^Im{BC6gu<*^8_q{JqYad0U} z5JVoion`=?N}c86Z#(9$h&G@^b^mMBEl~I?4ga38z0*xjXAqJ9KBa%MGM3${Wfm2G zmR}olh1 zy-_M3dO9rz)_;LN1XGV`N9kj9Z}!-dmPlcz?>kWZq1_KF{rj zoqgW2mjX?XbVnUqi6B$8s_%BQg$W(nh8gQAxghl==+xxN>Wk4VX|9d-z7AZ>3s>~O z-JcvJu?r;_n=b=-u&!q{f{xKgNB87m+LPL)N*~$SqeKnPZ zqlc<7*Y9^@mp+^(X|WnMza}qT?Pg=v$l+u-c2IyLVQx^Ss&4ujmCJkGyQ)kChK1D7 z{_@b#oaPTfV6pLqE3qu2#-@`W4M1*2eHT??llYP?e>r=_o7Z@nM=(rUUt2UpE)imV z`wv&Ou^F$!R?~7tOfp(f2~Lo7oLeV?fFcXGamnL_5*s?X3R%`kcJz16w{Z(0RAaPw z5WhYNEPDRMVhA?px;?*71rhX1qK(U+{L*cCDu6C@95N-sA->J}Vpp&h$&gqlK%$RkCN!x$D z_<3VyeqB}%PCCo49M;c4=SwC&(c)pbVehQwNJW@#eL@L~XZ>!gs{hO_7hyWd5tilG z&g|4t22>;~FsoWpt%8S@uaskQG{m=f$C&Vu0Fwuqh9+iUX3kQfp^D)Q|)A#i!&k83a$Nn&6<9nbJ z7XK3f6y2O}3Yn1lvzQ439&bxe&I*o(Kkp}FTV9Z~Ni*mDPC^NxviBbpM zijR~$r+`Bq`Rna;5XJ(sqnD0J^OahL;F^;Q2JwcWoOmcNhiP-7^sJ=MOK;ufsYb2{ z>*$@TUw}(HgMV72!p?HXAW?&?Nfyc`o~?l_HM;a?D({w2MD4fB#{l}6vx9=%jZ}z- z;FM3~B_DK@)WzfMtp<8~Q5Jkz^b$zj<$GVQBPf0nm`dd19M|m9YVac`Z@t#`QviSEdG@9uungbj zv$Om2q(oq)u812Gzj9FSTyjr+;Q#$lLot!^CI)YC+eg7Y)N9a8J*eH&>LSgKC`NHD zrR-2ase0wiv?xY)O+=lo7dAzV-{t$RR?>^^XsCB^L1>U^Q^OT|6a^8Bm;55w z>7B56GDotv?iuia@J(7~B2{G9`6$$0uz%dp{>SbiY-ttx2spDOKSOMUkL91cFisWx z+(5H-g1-)#b=Ik#1vZsPIw$?`b!t`p8|)4tD}q0JH&YX`Zb%#Whobyl5Rqujd>X9L zb%mX%K^-W_&s_uT^YM&3@s5x55~;Ab6(F>Zc((UE=1=LO>ifct+p6fKMz(?G-xBqn zFoA`R#t=C$O{@2?Oc8g2mb;6Cn=>D&t?J6 zcz0B;=2YB=9)3AJFwK0zybfS@HZLCFvI&5eQNs-OEdMj$z207A@vO5f z#7b5G{Hb!iuJSb_G&l4#Vk=HR*E%8etmt2h4ZRq34BDR8Z{y7-XCTn3KDN^EdVH)z zK!$0Z;QOwi2xR^*o$Wj#jQT;gKa?3&{^X7r%b#c?$0=mXkw#S`;W9O|}Q<8X*gPW@7Xl|(6;I0ZEmTSgU9+OL0h$h95WH^6l~{RJpn zM_FQV^YxCB$xgDm&w+mhdwYi?BuSeGB3qMUe+$x72@>s|+dd5UhSx;LuEWTSFMP3~ zkLWj69VG!zGf<+wAWSGpMudDyy6#5{Cku$nDPbGvLl}kbg{4H6(}pRbX?Tn$Rt)kI zdy-d~!^h0(*VR*45a||Yow(+AE!6vR0(a=JGaCHTg6ibAlztL0(g4Cs&Kq^3&o{ 
zNP#ZIKRaJ?d2)~(B8e%*iB(C6pwq<;gBZU!!VDdUcl176whn(cF3elcI>8^R9_r!6 zS)jy`$J7WO`tA0|-R!!@XV1!HxWw@OGG&tp`h0oA0xX$T3dyIpoc zAHjl(7MR#a&%KZBrCS`HR!dg@AkZgA*`d)NLmUHnzf95aF8+9Gh=kglJh)K#DlZZ+ z=RV8HI6}=^Rg#sP>`}X??Ba*>k>Cw=su9oF;>J$mH~;*oX=q!8`|wy`_a3$_&6CBR zW?0~r+D_(%6|)x0`@SX-ka3=Ds{4ev$asd&)Cgu2U<+lRQG~@{2v_|lGSH7A!Lf(! z(`P%&nzFp>%NJu3EQc=im=3yXF2;T)%pHn3E(f)Q$1izJ+#iI(<)9yn;xZlT^==`) zoDyxiC0lfBlcq*>$Vw=8mFC)bpC*BJsVy$ew9E_i7_-5b8bwI+43vUMn1=XWodUg7 zFW95Nh(xl=cdUlx$mUl?W@WJzRq=u=$0dxsX|1e#9X3Llf~gp6rLZAB9@T&!LCyKe zr0^wJ3f$@=qG867!&c?9WmLy(5%XWVE*+Oh)?jmuRv&_9q!{Jnly&Bd9dhHVnZTG; zj}l9E@H-t>I@;xwd2Mt|UaipbkvK5b&I*q@B`Q?nntQraguNIx?1u5$qY5$!*ej@) zr}*IR1EgREa3(#26Y98$jm00*=QT&mGXva)z5ryAxZi)4WV3Xb&_Si8( zR*t(Bvzv#zUTWzUqCC9)yf>6&q1=6__4AryaokSQ0+o#$yQ3o&@1cH#a>HkwCm@#Y z`*bo~hGn8q0O{(8Mg z?bG#vEJrZ&R8p7pd7phZ@!t1{1{)?zMAe}Ph1)3L&BmvI9XHVAZmvS6jOOXym!~*c zcHAW_bkm>9ycWQa=s=_cFk=Hv?evQ7lbE(r3S=GTn80SQY#zA+}de>o^P$Gf{}Kk0mFj7glKA4qE=>oww$7DI4w1 zbT?>`oS)Ih+4g$0>f~&RF8N-sCfwbZrN}spZ=_e%xK6BnlqY$f4pe| zOM!`Ysm3Z<6}ZOU*7}Jm*}AekEvChevQuaGjFb!S3Aa~p>I+#X+d8WJXJj_jcO3K{ zUAPwr)?1kNx~>sirLg@PW?_!j#5U4mSGo*b(!cCi%C;WRkxIH9wBj`1%8dHjP&|CpV2$9z@khqC(1bhwpu+O0G&Mbp?^1L{md& z+rgj)E^DVBt~Kv9fC-zeLAG;U1FKCp>>D_|2C!cGSI4*PfU3!xX>QP|D-`K{RG5pQ zBSwjNq4j294AGwQmm{|ViTm;7N!1PwTigkLL2l2tb@UwT2^U} zfF*!9YcHsFcyZpl!M2Dpjgu%Jkm#1p>Omyj#k+ulTKu9u40*!`rYt%xqX8@ziy#gz z$>%U-!afjXf5I#{pVecos`B=_heEgJb%EvCQhZDeJ56r*i|>WQN- zIg-wgAsoTv0h_czIwbYO^uHLkHLpO5myP@Hcn-67!NAE0^W?fY46e zi}p}}7vSc33jVy9`&djbnQ{5O4qhZ~a(spVtD^&)Wu*Y*p0e1vViQj_n=aPa;`;eG z+tT;nA|}vjwgX~`J=W=lsY`12vEF?P?gsFxdsz4&ZF@3OB@k@wfjS0DqF^4E#Iobg z-oZ(v$c9FDss?}Ayq@q%`#5e28q+5uD5a5V1qdSVUM94tu_xRhtt4I=oA*Z4VOeg< zgs*-%26H=a&Fm4kcTr8;J#=PF2inBn^SHP{lq!5OCz*ZCOIlT)PcrJ<+AW6D%-$XO zfa5uZBk9~=Y&3jC1Ge^dnl&c~Tn?pnX(03D?gT^K5vBG7=BlYT>7Y1m1bRnI9`N79 zy4~L*P_XM}K6x?RT>4>4caLU5D*gVEST!kX87Y5pZ{=8pMTEHXEGfh>OteSf>NX*9 
ze!*Tgwy0mwXy%nUL2n-+=kImhiij)p+i524SsbgA&#Jua-^CXD-y=+y^UrsX1_Mda$&M*jc`iEoD0mixRJVWaQVtNxDzw1*G>f+gHV7DE=YD^7c+Cm1L$wbX}Jpl z-mU5g_Df6a8tFux3Evn8j2mby+{k5^D{M3+44S%sW{o~4o>Xi<2A7_9DbXm^vkqwijNlHh?eIcW^WuX${)_*)TZ|IJxB+pY%ZxfY) zbP_5VGkZjf1-!(4aeULsCGU@`3e?HpJ*O$sxo7wDuP(A59l-&g$-%R)z5|U6O4?Mpta_BJW?S);}Yui zRah)XxA!7$nuMr}<~Aye=5vPkRd|i0R{8BlxghbKr56i;ovcTbA`#X_3)5zv1pDNv2 zzu!|5nHyLU)nu^y*@KnW`$Sx`mU3=Eup$6{?V|V5#g-%*eBn)?ibM2Y)Q*_sw4f8A z9Y}LXY1B=3T!F$8;W56+`k%4{2BtP&>~0kZnkc-_*jp9C&h~iZy_|j)Q7jM~RbOU= zvnov@Qe!DWMlu=xK@M9_F3vj$@JqJ-%Q*tbXDvB)gIh^@V=m$(3m*kLQr8XMU4kAS zuRH=wgm-|hEeAf#rHjqvqvzqEirhe~HCUlX;15^F{Qns^$o0mM0*bvv8C(NZ-hlf(aHe1UV|J z|3|$O)v}7E3JE~lKcpR;$qd5Uhx*5gxA0tU^XrJ$h+XSeyT^J+ed`{-&SAw74{Ext z+CKJ}#pk7_x5uTI2M-n}un--EZ(0wI8ux1Dfv!%NwIIk-cIVXF3<j#=XID1@6@M10#I)HR%1T^`he^e1xlJjQO3nunDI@v1G#c0|v?KEbO)?mxHb@9bnLus&KGo4kr2t^F-R3y0*=rk}tc$~9 zqwUe(cunrjR6hy_z>6eYG$ozbP5^%Lps4j%(?-S$W2OkF3a}D@3KNQU951h+QM{u) z$q~MrU4qf=tD@B7)=l4jz_7)Ajbn8^rS@jQjx?r_>j4ljNd?8X=SHH7;sxp7-qKg^ zM?l)Um?Zsks$cXyk{CK~u^wF=td?*A7Db*1LG51U|&4lR3f9t6OrEy5<2OYzORq^&= z+MizN+B0%!Erp?V$*YgBhV6b4{H}<$G_p8>=0{Pi0; z!_gfPr9$FcJ4P`VSRyjFR&~TqGaVE3qoRAw+=$j8@Gwes6*;+*{V83)pf`ls%sO@T zCV?#M$~8})Z%tFNLctqHB*6980*gSCh&1SsQgbVf(Ss11BWfzr!Jr&<2^QJwN&~Y2sji4 zM0}bvL0yxTdcW?vlj+t(qcY?6+{e}D!mz6Q{xK6>9skYYE8F2Y1yx$8zKY{rn*LNB z;WgM>@e(Gv_f7OzBn|2S-AjJJXob^7l}5^+{`+K~Bz<^}^`s7QPO(R{U1*-joLS%E$s3#CfpYg^P3mgT21fWb}y-L_38e;}@k% zw@j#Gr~7uDuCLD5UAONcP>HF( zL>M_bSocoUM14!)v=iyg*M7CArW&wN%1YCT zbX3fcYbn6a$M>BR66*ayH$|)Kl3$oNgE`2JLtKK!SDHDTy9G{RwJ?Yn&TAY|HbWa! zc)(qmxKB&FQva48fKkX$Zk4yr49wuW#r%zNy%qX~IN*aoEN5nRSdxi2LO9blSWD-! 
z8QzeUgf^ff+tL!e2&HH1-%A*rO8AXnd%X=?aZ4b7c*%agqv8|+=OH^%_`;%)m83>8hMApplsFb2@S5#=%TQs&V z7dMoO&^>I{nIaB1O0REP@3XpzKSYgOqMEZ4-BAp)u8l@_+0rK4`Ti(T3aj8N;8g!)B;z!Vd?uG|Z#%od!O3Nrc- zb1?5DVA|V+)o_`Dau2UH8ht*+RdS^Wd8Uo3u zJx3g`cYa%ao6`=LEDAV-t!O_kqKgsW7f6EM&rs`iDgBWhfnRt;)__2U8a^sVB+HBO zbmR{|=(Xbh?47~&esJ%EHN!>q;v}6=qx3Vqb|d z9z&ie#x`x>GyAcI&8*b?Z>97#3MJ)$BK<<9wJ%2g?=2I@q?hlH8RTJqxT=s8RlT$O z^IrL?poY$`1V3Yjowx2<_4`Xmg$26SV_w7aYrzC%3sI0LBhREFfWQNk;sL8Hk5lYlbB)TBZ{LoK=b@(K*SC0k9lo!VQ%v_e33$l(Ybe#H z|B45)Bg405--js;XAW?MQiJv?l{zrK0mSNJ3hL@ zBKylw@*E~A^*z3v?2UigvUbjoX=l{QVVXfcDux)bLr;Nr_#$oldE!2Wk-djh5&8#} zl6W6*}(whFf z6I7*3qH>t`NF-M|u#+OuR>MtXxJCpHyu(MV-W2L*poZH#*gzq^BpSJv0Yg5!K+~tK zWU+`+Hfj$9Vpgm92yL6vW|+?XYb`_mSLS#Qk4eQj$9;BmnEeZ8@`Miwru-C2YcgT! z_4mFm4fD2s>Mz*7qLyAB>JlAqROCqUuhz6Jh|eG(7D>Rs_JN!=A#Y74pWiJeZOJOI&*M*Vl@}}a@<>`l7S_PqYVXHW5F$Y zSKrqXb7X3+o%?ci;-AHlYEsx07fvM@?2SF~FgIwd6(nghTP6xm8rcma^X-&0(6KhO zkM?w8dNGJEk3TKiYCao8)jSOJ-qgydH49xnU-%0e_4&nL1IIkv#NR1pP$x$cB>y z29O(wR^$(J%lh_Vh2dJ zZy6J7H%Anw#u^vUbuLBc>{>gisuyFV*dR_k3sOod->(%HI#!Q5ZHKC+t$jp2SfLDZ z@_iY)hSQaTd1uY%ANtPwj%|cIsM9W3?a?AKrM&#CP=n$75&TVBPQL(z`#<01_5eoE+MRO8iS`$P<&FO@jd*+{29%bk|74GQ}_!c~E zIY#iHl+~YVe*Pk_cjakHADI#a`Gw}`oYN4qO(KUJ zn7rNi`2b?Hr-G9h+|y^3v#cU?FMf*^I9WkM_sNtgJjRa!v6__L4Y1zED~uX$W3cmm zAym-S>2=J#W0*ALKR9uu_*@RYF#k}|o~9Z<)y8c6-F&zgG|8(|^Ls6~yk9X5%@LMR z=bLlgxYC{5fFNI(j0J1$`2d35>C%Ej1)I_#vO(JMaw<9W?%L%Aiuh~&{($bnz#CEO51J9!lvbszn zl#Nrb$CsX;Rb;YrdOGR2*Jug%p|tQi%p=;x0`-?V zvM%4lDsuy}*eBaG3M*m^xyK?3*UUu9fSli7FHe=@dEG1dybJOgSG5O1&#g1Jv6tGj zK9q8p>P2gvS6ryvu!qW#X}mMZJ=?7QF#3>oF1&)M^J0`*)fj0r+0K&@-pQtD4%g9k z!H1UxWD1Tuz2e2hk%b-ln>@OJ3ohOsU+y!b}VESErs?D%)ti=u2>53Z837cuYW7AJ{R&)T-4 zG(7VA>pqBS7e+Q_JTJ!yVe3xFwfL_udjGninyGm%dfL0y%*g3c+&?q5Yu?3s z7rYLlDR~|A4e2YrBPbMf#IEE6=1%O_2zei5>cD>G)@voZiHkvK34oM~r3IM26sgt> zC?hf#L5)G`xfJ2PvRSrgggiPIeg@g+z5H$?n5aUQUN4c_K;lx-ul0cUsJ0q1|G7JE z1&WHATcpIQGT5S&K0-SOG_bV#!)#Lj;MO3xfAPv_joNzAZaevy!u+_qjf=Xn`5n4+ 
z$qN*iMDdN7=seb*x9a*So24u+w591G$coe+Ok}?RpoKnmL6U+ZY!-3&%qXTa(Z4hZ zb*#%q+gly*rC(YI>+p#_wZ0g~@J}yRJ5olfl1=bYrJh{{@)9*0)A+-N#{@i0)%?@% z?R!h++{wugAb6#=m)_2M3~Do-Onaf(D8wL!IIc5s7b9xz2B|gAZ~GE)s>mo#=3N$% z;m%uU{Dimds3=AL0pON8pS(UZLyR}8T(xck$`ZZKJjwNqHL*#0oT+JEyWx`HbR(x+ z)}>3hX0t(7%M9~K{Tz0aDwglSy{idv7mUSiO4qjE(~CZM7bs&f-UxJ`C_0Dy04;>3 zw2OQrDjXmzQVG)07E^i+4+W8G=M#71{!@^(AjLY9A*#nlMlb-_Zvk-tp~i^jw}NTcA=C)StlR}~ zwbdPYC^NEGOX~2{t7;c+P=y7KB*lMBHJ`T)_o`Dn7Qa0DF19w~OptK8Cj<glIswe24RzPRzvaFx2)G|O@L@J`Y-S$NF_ zepi>=PjoeITA|0ct)H!5w2s;#p~OdfmChr*Ke4BBFPhGQ=L$ByrfTa_iaIesGozfM zi72SYJUpvU|6QFlo!yCUyneNrn&GjmkKoDU;n`i417_e@Hm)$+KA=JD>Zh7W@9!>= z?7ImK`NT`$r%rzwZQ&%Gxh5!Hk`l(Ll2A!B1KMET6X5AKYFS<3R4JPgZa#1*Ol--I zA}Cb%Za8yy$Q2U{No`2!u%#_0Kv|dj;VR}0ZGK(ozZDVY`i~;Q01zh+;GYX3v!sQU zlZgX>S<=eD$wb`5$kx~dQBV-k(aFKYz#7qY#rm^??KTT~$AP*lTH?z40mWhUpxBH+ zX(AWx#IFtN((ZnVlcRnOP)-7V(Y z$6r0|W38@&y&MHv&#wAf78-U=9z+CCyy{$VowcjVJ~wr(x?Po7qAo5(RaDCad8~aS z#j41+(Oaz!v!o8$xs5@1A%!J4Nq*Ve)YQXFh$&N2maq_7vs=Gg3t+p`vLw}Tv0f&f z9^pv|C*UN$IlcYD7fRU>)ZHCew#JQ%KW*#yCkf#8;?mi?Q}*ko$Fr(v%KOR_e_eX- z3yT+#7WYY;y7QAvdbYNnQgJ{4YGJ^DXoJmxoI9GgRkWf)m(6wNpDPy!x&p}T#tIv) z*A}zQ2D2Tl`c*fg6UXit%~dZPX5@;?XOd?#j?Sk(qQrX<`ZlpvqKGLU{qXz(-^ZZ^ z!s0I118ZOudAMOFL5j~Aj+Xjk_Ml26-DD8e5YlFNhol0gJ=1nDCsv^9wz<2=iRA|+ z$5I5WEX#IUR4?2s|vLG=2Qn9T&1cX5Y~Cn`Hty z4l39O_$D>Y#9J!Z^K=$04*LwV8Qp8P+1=~69g!IXF{L~QwbVQZJL|XE7aF$NpCMcO zezQL|17?R!+w4wSm-rppm-wBUm#Ie)Q$KAXW`=&V3hmITP26qv`=EPdfuMW&7RdRS z`^~l^ruwJujR6d181BG+pFfa_y-OiM_ez`(+BM+#u6X~W(6>!+yBabnUGxwJ36TPu zRLVh_mLe^0oE>=Qt6LXr+A+&Lq5l~v@16bjIp3YPMCK!A?O|@4b-NO)dqqGOjn9V0 zf2sG){okm{{r^SPdDxnOPsyTCw#viF@!?faf2mnFtXLu{;sX6bDMS8J6@TTl z4DRRk{lxu5-;%lzNw!%dvF;i}>-^D3X7Jcjv*v@N2Xa+aS#ArFj!);eD(6!U$moNZ zouB+|b*)U?^F`;A+*$3v(_admF^@e|w$@u;A~1#^S9Vsryp9S z3#XnE8*eYSn{&^20TGey4RqB4nzuxxc`+?PM zVQsfMU!K%U@;$K`P`)cjL#vXbQ^eW~AMaq(B<#6}1_*^@zcNSCvia%EBZx^rx_VigjM|tuGsN zvxw(P){X?3k^^SJpj&~%=uiBpjjt5Gn{|5w3E|)cFA~0(HV&)gi&c*+io7o26}cQ< zzIPHEwc_B5m>9M0&(_@_&F^4#v9qV!u(P+)v9n7%?~z|1 
z=1Eb{!Bz-Cy~zXv^g^e8s)kM_K+bn3>Pg*sFaI2=1l#L?lbbC1EX5pjALFGF`UZ}F z!5yv!BDK#fYC_zQ(Re{uiQx{*$QOEG++PqB?^WZ2z68)*o3uVvKHr_s6+8-a zVw%02yUccSOhb&S3rH=?$fz$BoDQ!JuXl**|ft&utxWhXm<0prDMdi^~?hKd2 z<8P;JNNOro17!@0ewN3~Z-$euzm(?C=U6TS5=h|a>n@fUUUCX00rMoqKvw=+zh0wq z(5HJ}n7&}0_s9_hTt&VR+LC>Oj4S_w3#06+41QU@iFW}~)F&NjiROjPX?~FJaLR&a z)(C~xb>N4K#w+e-Js9i`-swMZE~c^@z5KA`pto&I)YQ$hocBr9Vvt2RR~>&ZP+711 z!{g^(ye#vD(i)tJ1@pt{!}135411#EnUf?*g)b`AFb-mNP9?eQ<2<-Gef~Q?8w0(G zaWB56vP-_0^tfZ$je!1u9*3?&LCS%=tPd52_eQ((-1$}B1YS3*7x>cl3A8kZF*$wx zfe(@Rg1SGgmnaa3sXN?_Pj340uY$Zd+Tnl;`iO%U=n!DVdv(8tlT;*EG+VW!Qd}||ANV1(D(~FEF6%Pg~jDBSp0>5c>T+4 z_x~(5e_O_X!S*j00h#4Y+`id57(3EJF6SS8TG-l%LsTz-R-BiOg^iPim6aXh$;ts@ zWC78zu+TvK6l{(ECl|1Tft{U+F+@TeSUH*?GJgU~XfR1STUi+z*x3Ay1=!re5diu9 zGbRiGH4_I%NY?;%CRPqsP98QcPIg8%Ca!-43vz&o20-4z#RLGkDt2aZ6GICF8z8{a z&cMms(Zm4Y@_&!#Vq#~4gchRKO>B&94Zk9?{oAboyV;oln3W8_sybOCGAjbux&LZ? z2S+CW8yBQ0Z(?KSWDa2EgxtS>EdTS9K!>`fEg?6$_hR*C{+j66lLi2Olfou`CCrjf zomgOaMoSmGP`2{f@a@T4tV)seQcStKXn2eJE;b`ko@JumI9i4()ojKBC*@=Go_uPC z%m}#Z$4GsnJVB9wa1uThZ!mt2iT(l$yIToR zsKPfrjlRXOfKKY*+f(FkhUl~4PutB5)thvu@8FA^GR)dkh*{)~H&~sI8Cr ztQcme_(geo3vRvuNUbozu*LgaQBwY3^=WRLvR8(c;pu6fxD5*c(%5vIKlb&46=ciS zuiQ=SWY&B0FQ>ako&LxrMooQv*?fnkB30lh81Hspox5&2(}DcexhTp9WA0wQ3*Mq* zJmE95{|bM;`#$wwJMe#7UyW=jov z>wJ*Qk>SsT`Q2psvyTUpkgvgCU%1Ko2?-uK#?8&mo0q+`0^Mo1GlIi|FN=QI_J&ez zW(B*K=I3nP;!8XZH0rva*Zn-!As{Lk5JEND$H_;BFUg5L(lK??>haAd=6?Ux;5`Y4 z@{Du^!c_eXo&GqQOrKN~Nj^d8w*K$t$*B*E&_J$k{b1XkAmAE606FZS^ z3Nvgnf4}pDJG7bTvYmr^GOv~a^g)_ZczM)LauXKv2|7@qn$qse`4&IR}q8RD2MxV)C_eZ;`8XC9Bs2g1uW>O8n1Ex^=D{L-jFo~Q+*G_DfzwW>x6dAS-FNAU6$Q!rAO&rYqE!DHtB_! zt4IIFm63O^(1ofuj#t;Vx~p?|Fxq_%fjVYnQDJtY zrSkJCN>V!lg+bA~?-Z>N6(cFDmT!i4CUwA zY~NM|hggg`;%BUn8RmXT-&yaYle`(PPHUzlA2sdL!>zeY+b|Yqr|gxHkV7_fb~$^SkvWyo!l-)MSUMwMQg+`` z`ufKC*>*{`d_m1K&1(6GbcujuMK=2#rx((AY5w_z(5S&$W3hJ7mBD(X@`AG^s#m7? 
zgYkqzO>EsJ10j)S;>^h1p(WiUjvhMW;_r(4Xr&?)h`iTz7?tALQ1vN4p~x`o0rtTb zPc9m^eoD(Q->X-2ninPias)p*X~21y#r>jLH)d$Htmro|!GvU4&h%(u@zEfBQ^hCq z@j5DrZ-YEHD2YxYlQg->q^{78c=;EXRfkYgW1>!OUh9ol)Hj1>T4_iBvk>rsS6_pSdWfx@;6Xz1=VPlh! zkQCu&6_a4+;^Gkm{Le0sUjCKr*#7qt9ebh$qxTb*nDXy*ue#-L$Xq<^U@=S%VT5{d z40Lps_xM5GFFn)oc{IOqv394TkR``tjnJe&iD0wOeHtP{;i&2UXuW%w6M^C`T_YI` zd`!{z&5d__=Dhk%aK04N_S`;P&31;}xRfg#*0BNi;4w!!jDf=SpRsjxGH`Hmb$~>Z R6U55JhDb#vp(Kg;e*k7q1KI!p literal 0 HcmV?d00001 diff --git a/examples/notebooks/pdf-processing-1/input/lorem.md b/examples/notebooks/pdf-processing-1/input/lorem.md new file mode 100644 index 000000000..35723ccaa --- /dev/null +++ b/examples/notebooks/pdf-processing-1/input/lorem.md @@ -0,0 +1,3 @@ +Lorem ipsum +Lorem ipsum +Lorem ipsum \ No newline at end of file diff --git a/examples/notebooks/pdf-processing-1/input/lorem.pdf b/examples/notebooks/pdf-processing-1/input/lorem.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b2807a44d1d0d2fad2a5c88aae1061306eb83839 GIT binary patch literal 25723 zcmagFLy#~GtS#8KZQHhO+qP}nw)<<_wr$(C{r>xA?rIj3MJie4oJyrCsU)NdB4V_R zbgWRM^UEXaP^<(D1olQ&P&_1 zpiIlmyu`%Fq6#UY6tVg|Mk_h9y2i%FvNAzONf*krmqV&4W^Aq_(6{?Sg_)&!jhdB0 zb(Wf)dZC>n9Kk9sw}&|Z0=%zK>Oq;>nf$+z|FixFH6zpi2Rt*jr|fSx96V}m49j_1vr0|TM@^v=ouz>lC~6}p=R6ZLqaBc`QL2?K zjKGY{i?krKsihD&AT%&C5;iYLXdTev+{6$XiO~hf378{i8t`#c4!;E-Q$|MT9tFUW z0OI@}kp&Q=1K1=`E{y8dO~4Ew^2>`HyUn<1*W*e7gdOG*_ zazIM{PranQXon&4UoY6>hqu(Kw%>YKuo|{*jhQDhEf2u z98f_OO;rgjf_kEgib9#$hege;wW-PV9iL`!lB)911prZy6IBsFAr_9HwzB$hO&^uu z9Jmh?S3vHB_;)QQezG^nBmqVQu}dU+g0g=Du&9Pf8Zo z#`YHvpn%EUsqx6n#KqOsl)>4}#i-tu!MMKZ7dt|c_yePfp#{hhI|M>LT{Z;=5MAW1mFh}MZz>JO!U@2TeRFG`2m5mL* z9QWX_NPhS*n4DZ2+RbLZ9QxPa z*GzyM02#mn9-tF5oB5aZ-oEGi#O!zKM=+$VEVl-?e|dfr@!ZxB>f0mm&4tAU5GV&H zCvZ3K59{3>IxiF0z{v3WneZk42I$Y-T{5A9HsA{X7M|n}<|iJa{CBr<8UJf}7|rAc zjsY-hka=>0%U2@k@-IL46~AdXr>5ov#x_9ZFMR1whNZco-tF&g`A&;H2T zxtZ}_{CYS|Y>j|`xH!NT|6~BZ$T#>Q{OjK!v_XoR5?Ufk$xmP72e&+miH)@pAQN!C 
zk103?7Y9b?5VNmE;K`MG0QpmRy=;J;e>7_Ty<=xCf zL%?729~U)%KmqnVhM(!e>brJ=sg$!sX1v9K3?~XG`j90lTYi`d&au>|3^`z}#4Q_T zLf?Gf3Vm?nyqZ{$_8R#FzDl(qVuu#C`tFLttMZNJk@s)4@4HZXT@n`{70%6y+ z`mU^$FG*^0o#x;xb1`piE=B>DAI`O`Jc}O<^lai;FyQ3MbnhL zPBpt;77$e*kdlbJ{|kPpMxt}sMuazoX-J0aeOvnjQM;dX5hCcY*>SGK3jc(YoZ0*i zC%;YaD!$c*Mch!I?GPVJ4Mx8h2~seVoEHYcD728?9A?SBXh(A5x7a`^)-gNeE#{2E z3t!>HA(E!kh~J`7Uu8D}Hw@K?DG)@cG_;^OZHQ&ra@5?!8%IR#-VobOAZ~&R+McD; zYph*kuzq1{OW|R5YPO-%(bBcQ(boO?r%0p-akvS;>W( zfNyOliT|YuYuOJfP-sb6bacayEETJp<^}o;Lr>HFhI?z`%`?N<`4pKOCXMv&+#Z=R z?BE553Y++=TsnCROM>&t{aA!CwX1FgyVPYVxmaJD(%TBYa=|$3gw3*OWsOMp^vGVV zRPTvNFNu+duP>AvN>prK8t4$PQ1-ZTQO{K7%4qV@y7H}^72QiZ`EKRm@@*PZUY@q^+eJ#>V0b}Lh?^$skN%#c`l7p*JWj&2V3i{YiBSGzpC>)>zlLZLcDam+u5@3d{> z4euK*t%BR2sore2pAL_ka-?77m1$8NXI_bn&`=RSLgfZXY0LwRzJZCKFaqq9c5=9Q z2jGv3mdJAMN$jX}fj_=Z7npA|C=r~R**f=i{C**Vjlt}ZbWRR{Qg0&V1bhTZd`?YD zPwgKQc(@Yq3>jOHh{?evPf6znw@>vV9V8^mUluK!6)meKXR#d3ZZ&~lRY69AR|`vl z)Nm_d{b_ZOrM`8Ug35UVmZl$qp?}%e_s>WSTV%u33v~E)ze67}mj759?OB(Qx2-6( zipbQ0yU)lHZO`2^~FxspwmKVnJOJjXfwY)5&VwMJPJK`h;^av*NEtm7`8O zZGQw_*qE(Db1!X|eXN2Yh>Jx$j%rP^zpi-%QL-VJCq1=BlR_M|WEcg@`4yD$)whdE zCJs@Tr!f(S%TvsMR2yxzzTYe_Uh=;p_no7%2+*Z!r=66FcH7r4j_r9$f>4Z1XZHa2 z5W}jQ?j{mHEp5T-=i;AX8ydJbek6kKF0s$PYwu&L@{tK!YcwO<7__47hpuz4y9} z_rzCa4s^AM8S@q*h2U1%G?qek@kD_lC^AixXty(d%}raxnsxm+S+2ow_XB)ZwSRp! z=#-LyK;0wSJ}8iehSF?grm-;Rbnsx2;K|7@icpjSLCdT9OF&_7GCFr z?ncPP9WkQj39%aSO1|vSlT59)7N9JPH@PCke(*mWhTF};{erg@rmnO-*+GLG9 zfFAk{Gs;W-Cz@=?M!UL2|F4>Eaqor6bg2y0GAI7RByvr*nvs?tk?$%1ADaxTAU78~IKLN{w~xCB?}5hGsWzKBr2}5iy6v z?bN@WUb)xaf+^YMVjhCJBwRWGKi*aTt)4kHXzie9(S+Qtu0(c9E%VA9q91ZvslPd? 
zr7Y|+AhCgF76lIF^Xq0YHyabVh%TVR=rrR@nQCzDefCK5OK#z5UZSE6?7-wK*bJ&f@reEIs&BYUgZRAnd$nHqYc#01aF9wZzF~D-HnSpiaB6tV zxSW+iZRj!>C7NrzyJue1bJV5HWJISt zbd0}C(~QSJudMIiSI9)VmC&zux-5ng%C31@Oake%Pl;E9{*XbrrIg=FEa0 zMu?FCR1pw}PD?v5OtMrT#~hm5N6c243lMI@GqM2$yPKE4bKH3&>fJ{{QKO@~G440EI*deae*kWEqRZ2QC>Ka7ho!lzCku^IH7~)f}=ub4FL^SWhbo`zNom&bui0)|l zC5300ka1WexD&Zmje45{Po{+xB5gQltZW7TW?6YSdC<#8aF>p z|FiHNx`Xfl9DST%l;JWi_Ri(KdbPN)MK&0c@e9|on^P52xoK3@Fxo;R9J~@wien?f}=@&FFYjv7J^#rB%Rj>c* zoi8v<7_4Q-r7Hv3IA%G2K;zbUu4Y0ZsEYiPkzwnHqErgkbi;kYnxQCJc`RAI>BLHW z0GZuCY+r8|U=_;H{Mb4SYTQ|$nE^WXn6zjTI$tTSQ@EsiYDoVVM$*6R3R@BF;;hO@_{te4apm&2BczAeXYax<93Jj8{R16R+(6=MYvDVDOa zPY5?2OvbXe$Rj%QC{zr+L)1DTDf39wRse=hp;Y~t#WMKOg3Vu`0`uY+dWe?d@ncmOy@(~rFvkW zc6(I{AJYY1(8rt0Rh^&8KJCUP5A+8B{X>h13jw4 z;rp{!qx@KM=1HPHP0neP-`c0774e)3A1W_+qDY}qI4(GRWA(>kDvvH$YWn!5o&HSe zr|>^@*=qn-t%b0+H% zCN%r_d)HJgVm|ED++-C*dg2XaI+Bh)<^Z{7Zx%fpbMj`Y>Ig^>3jp zk4dC`oWk7%@TW+;;8B^pWi>)I;zz_~*8=cS6t{lMfK=983o-~F?dpBw71l0I>LCjJ zQpFn7aWI(`7%?%Mu9V+?8AJnT6J+YC!}Y-0UNuNc$Y4m_dnHifkHT#~Hq zDYr}1{qBvFnmtl7deHf{dI#3|WWz-4s*BhV%$+vyOBNI?!O}OWi+a?&Ozn_VMoFfY zG)QlOgn}uvyrH`}5ZT?!AV^aNyALHco2jhsn8bV|iy|z-l8U1hp_>nVOXzfk?%fiE zLU-o-@+%=J6kO!leXDT8Ne@0}Y)Aeg!yq^TLIj}0Q|Ez>$AR@tC7Au4Pv*;y)8cqe zFrTY)*m^P%=|3&gZ5L8Q6L7h&`S5F!kWgG0dUF7nEQIoU7)18m{Z%KT+x%z<=T=Qr z58fhP2d=Y#_?YmUfN6%G8kKh(jKNibV40pjemNFdu}v(Bp>Nx8gY#$E5^5|EbQP@A z8BfwI-e0xJFqY6(UG%YZSPLkYahp39EQ7`Kg1Q~XDb|m};mH|Hov=9Lv%{RdFcngU+M}BPv%-IAxVz4#qq5+YB9hLQ_u22do=L zugk=SwGJ6qQG5VTX3a{#yxshPRpSO#H*AEGMqyusJ=nIr7Y{mzTf&qcjiVx5Gyt}P zgETTOn*ghGBF|=cZ+1aMxyNfer3}`P0SHN=jjB(+Bq?58?kG7&vRytIWe8t zODaWFcWgz7`f8wEV-Gu9 z)f>8Q^WvV&beagUD{Gu`?r|vMC%5 z%z_9FO&Q2Voxh0|pO``x-3ZE1&6tFP6ZWR)v>Ns<#n$^Ldzl z$_cv82(E2V>mx#$a1JgK-?@m3;Ys2En&d4$0A$-^TXqf|)p0Z6M?`cM_HD$BElJMe zeOg2sZUv-XTd4QidrltyRPD*vPaJMr=5B6{hcaB=ja;B-GDDAf>6!Jp6nVV{B$Pjj zkAtG<*;)dr(u^CxT{U8cGbyQ@u?e#ve`H}rjJSU<(riWU5 zG0vg=;=#TU8EYZT*eozlNq?{aJ;;&?~w1?OMNHwcf89k-M)Zv$!vXWfoa*) 
zOHzwxqR<5qg?-Zw8J5T@G)uo2OiUEA2n%#{WED%b+jn3Hq*vC5`Bzz1XgAvXZ0LdM z6m5F6kiT(wAQ{n%&pP%4VH}@GY}SpHW5Jb@_tkqenJ`@R7}BHiI}wHcVrYI>BKgwT zlo(w5IAZyAP?U8RbXQu`v`a`_kWuu?wL<4mjr}u5g~m7iWFDJ zBTpV<=_xCbxF~6gA#-t@z>HJS!>9=e(j~D~qupB8w&_cYkjVNG#vfVQx4>nn^zsEB z`h*hc6YnbJdKFD~eddXQV&I-`)#e(z1}I}%13!^oEH|trVq|7;R=eSIjv!-a#qD`( zP$8*$4|bm?K>Jt!1FqUOgv%o%Cl~1j6!|_mjm5Yrp zlU8LvL)P6kw^q1Wvi_qn%>0lp*E2Mo2Zt;6E&=h?!JOr)>F_QGnmGTU7U?|WywEPG z=LE!-si=_Xs|d&<@3{)4TRklKu=+AFwLd$%4`nz*7iHVWT0z5iXC zk?z>Tu&6Q{31rZ;k3Q~JafV{{MajT!|1C{^o{GF>>LXiY?xMj>^o3ImEs`Qgt79t9HIz%5IF1u9qjriO#> z&3E_!AQLny4yTxy@7}kVvKYkGflWaH(KQVi8$qf2qpOBNjaA}e@T?{aQ8smdCa{TEuY|) z>VEs1XCiOi7VX$2Ui*NuKgz*bI{0qQXYW$C8(zMy70SQ+@i5)-$jA1!!DzVjFh65& zvK`Coc@$`5CeR5wzC2)uv`y`GO{?3A0)wpVe$r$_kWG?7$P$(1T95j4_|u z(?jQ`vkN60O6|SH=0?#|hY{FNi}mJg=}COHR_rMX-doe?t=wJT`J)d-Uss(X;F!1` zlD5$s@@^M?_7j)b^2=Ip(bcyl6_!SVUiuY3t!n_hvFm00cdAPrtE?_=2(}hs`w?BM zqTdb5`&D~A;9&L@E`D^GuVQT1@^SZu%f>Tc`+C>3AFABD!EHe%-3q&7j(c0Fe+oxq zJD_YL^oY{?9Bm2|Q*au6>2E>CRviT^_4n5e4SD~_xL)#coHN@=YYgv4lKmu0$*c4D}FG10eXGv)Y>v`H6(qZr+L)zsv;H zy~LZIkEZJsgGdJa52|`NeUb_ba95eDVgP+luHufFa_47;h<)vJg#&fOw9Q$aIB14! 
z>YFx=bbbxY9kGK_;_}{&H|?!}P#Cb<^QneowQy;oALqCawSo~~uYd>mkvPDU5ovnU zZUtyI?J-HuuG0&Rn7*a+S3$%AKM&9V$u@VbNwB*B=es6PHWZNd1BT9@vRoTnW+nxVFAT4#tvJY#d9%RUx5tx z)*`o|wXwH*silxwGn5=}(4^BdRw`zftjRmUPvX2B@$sEgYbfwli(KsPZj1FKG_Mu7 zP0@cV;oJ3*o(KS&kvhrqMs!3y?aYWIsAxTpF{9;g zuc_=jE@za46HB4QK~Nda4vnb5TfruU_|o?_BdtYZ-u2(3e)-FC5Lji@{Yd`(>vmlW zmxWz*aV?}2XshRDP8d$ksK2IWBTc_39JEqem6{48mGryHfDDQEd!OL^wq4-C)$-LG zm$XrjQygO*2()!;Fq`I3O!J4|5h5@`y~eF6!>g=GD7=Fc)R0#*k&Z)|e0@iri@>If zUKTml<3AdpJVJ(?%Fwv5D6+%HK6CU+#27d-9H?UwHHmFV(!6w+eDQVoT>N?(OkKI6 zhUQo8+e&_$XTvf93>@n+0$O1(>b$cC?Bw$c>QkSH$SbDpQsVMYzo_8r&*PRSnv4luPKA#_Gi2x`M$M| zHyeN~w0Tmf88}%yK+6@JAkxR^{ef<({woMnqJ5VFJSmq6wB3bmOO-)Z7^Rqdr(7SU z|0A-pk(SNN`zwD;i5B2R?gHm>YEmy9EQ=&H|B`XO&IBN)#2QJaBsa-{zQxGR|QXhd+T0@8dPyH-k5w_`TKK3d(_HVj~ zPdL*sq0Dwcx$d=?-E|&b#`396)L8Dr;`8yKKoI8c@pH^bJTP*^)0q#c`Q^g1!lB5W zd*Sfy21;*YyVEQEBL?F-`e#) zh`Io9Mbl`zW9J;T*}gKJ%8zF47f5U3;aU<4E)6Vs-?|ChQ9%cCd7#BayuT)f&`{AO zYHVsjukxRsba0%fNt1Q{g&^Tp4oDPo-7&j)WsQox%YIU4MH>QZzD}mB*lq4GJb(nN zWQm#0x5sITWv$lLE>~sCb|h1>g!3cnPOCC3ixb@T9?PqOK-5<$Ol{D6L~WpOi)^9l zC%E1I1n49?679@B&MVbJ1D-OU)@q?^Y3d@czm;SKmMK%;-4gMRt2KVITkS-oKCKZ> z@|F$V#9vMeKl~|&%Vz>EpnWGJ(_>gYZ0~9#&W~XK`H_r!aQ~bd@k?2uD$8ycIqa5A zf5z{J4$+|ot4*T?KQYyAugKm*ckt|zYq)ny)E5#PIj zX2d%GsFm3Lk=>P9ZWk6zhz;b1C7U&O;8L_3^Ny;l(Is zZ48Y*mmk3zv!|l+CK6O@K1WPK6O9JTT0PaXg+l!~iYr`D3q}jP8Vyy<_cXeDIpDmn zbVQ31fI&2NEG3{H6kJ2F9ddVg#iQ&Gb>Y#hK2d^}4>->GXtt{3vfOA-fE3gnvTL0e zZ2FM$(fWYyCAn7u7q$bobb%1RYkXhDp>Tu(LC3#r^AI+-2v{O`S`2faYvT_NiQ(i< zy_|GtJ4eY?}` zmoShj)L1m-wkdx77T(?Qp5T^CtB2vkf%ZbaTIEW_>?d`!U0~kpgPI1Zl4a-^GS2wA z*r%}R>B_t$aa(!cNXf!`sHpYP2hDHmP!K{sf8tg= zN$uvMBg@(q49Nf&lBwBZ5~1-{0x=ptm(SLb;g!iSAH z9cqIfI~~zT6^bXhQdbN%fQ_au@*&lUlMw6)ta3dim+g`uTwl0*Gh*R&W6#hS$haD$ z9zzsPw84R^Z!EH11hg@U3lEonkWXU}+w>&)J&e_qX7AoqYPm6-_=df*c*A~s(whi| zQ?Z<CC-)1?GprZ*-r51VDKSS$9z%ELSB&Xr}fyxOei@fI#IPolHtH6`(aKVmI~YA zhc#KUVryxGN#x@~{qBt*y7DbZI#z5)iL7m|@=|>G3*n0EcAqxv!fCoT&+*XBbD02= zW^E=3a1Ashru=ypE-Gp+3T<$oYXqu?i8>YJxgPCFBdKAf&YV7;DQdF~1#Y{@s~?vP 
z!agxLu%4s{2i2`nNXE%@ngW(kFQZiVM@g{0ICB4ipkiBzU%Z~`8%$TfugMAkQe1`f zmKjAgLHXATn1z#c2)BH8&}ki5SdH)PGsj?f0Bib4}9l67OKAr2;dLjf;^btU)8n^ zBvx*8x7aIT+E=hnex7$h0h`y?<{b%HugF+|mi6nDgjDGpD_)8aA_6yc&Ld;&ypWTl zO0}Rt@}!7GgaLC3HV$8xJThj*(Wfxe&bd9Bv!TaEx|U(`6f+wZgr8^;*%yrT&Zn=} zOWyY)6jg|ycaCyZ!ytcNXrt>74n*%wQ|UM8?T!v^pBTKc!J>-Gi!>J!Z{UHlyC z{&O=!SJeb9c<-y3&k^(q#Tk?Ni{~LMCZb~n#eCqLAiNb*?yeooN+9N*G`8I0_kj!p z5N!r=eu|7wAT%EY40Eu1$X>`m8(jl#rd{D1_o@o^*#!9kW#+d0YofEx-~4CtJSHK> zEnvWEplqnxi~(x4$4qsy^_zt9p#b}tAlg}`CFSseWaF%XBK}l1$eTD%$Qn9f?_h_F zXIkQz!DVw}l4Hmm4&@3JSg9Mjvb>*3$^0#Pm_j|D!UPz;BHfibu$wnH7Fh)`gtL#c z#}$(s&wDusV_b{*J3geR&vf|Q3Kt418#j|-Nz zE_8N@OJ`amu|8~#<6KU3n?|Kkx9 z*WM308^A%iaO?)gxn`-Qf&h2`_n)t1<>jI(VAooc>Lb&OYYU5^{dU>uiz{id0llx= z;0gyCt(^^5$;A3Kkk6)yPRJR-(Z6ZqwLYoJTlaA%b!#=%T3dRilksml+e9bs=b287 zH#X0k+CVB=X}e5JbsK2d=__pj4j4=;tmn{)D@+H#Us@5HL%gWa15u#E*-kg8>{QGh;#9G9Pmbr2D~UWJTK6da|E#C*s5KacvcJ z8p;UTs>5Yy0`DeK8?ZB=tfx5i7IefqogcuyZW!!qcsVI;^L|Rh%Z7Il{iQ`p`L~&a z|Bh&O1a}S|!u_Cjc6RIdGuxwWZv#?7LtF=8)k?&P_zVzrS2HQ!$?QIhSAzP{)n(_WGL;?-#6 zb;U^Or1$==dQ3%>9PvZi7b4QKu8D`B^rg7M8Z6e&V}x~poZpvmMPxUvQr9RmARuoD z;5+RmTN_7=A^XY$n8iO+l-Wp%Ias;We-dRw=U$o-T_};LyM2vg5Un>mvmZZSqQ8yo zq(9b6=wg*9n{EATn^McR!FmIupt zVe1XZU)T?Ozd_2YTHLctyhz1yz_@~$Z+l4|&dp}=ddnK}>qb}=A?Y~zG~mKl3-0J)r+E^oAS;i@aUtJKdREVtVe zrvV#0E}zh@H9~dL&bpgjs!0Twjeg^E&Mr=dfOITIP-~3E`*Atf3*dNU;En7DC~JzU zJ+0?Nr|S0VVa_cY<1&oHw&{bN4<{c>g%ppke!JF8#s#H--O4gXG)#>A%a) zKQ)quEUQg86}M#X{vtr;N=RQrm~J!B;!AYh=(r`88_e`1gMNnfm;pzD@00Z@IPD7E zHFm@yp$OtaA>ceBVv%YOXQLrO-l&gl=oc|VXMrl(qPG<1- zy$(Cwd%YDf^Ta12XE7b`QhzQ$5S zT}sSG-}n2^^-=K^BzSQhvN;|{>b3OW$*r}X9D`Yr^MCmtdt*+pqr^+Z)2l(@tAUAa z_pI+!`k{e#{VeX~7hP8rXr9l}7kRdUvb=4$A^J^#Ni+o5~*QgAe*P{nA6fB&h*wZR-&wG@5cZf^UqDzDe}!k4cxIf^71v%d=& zskutSO``&bcZMU6(5J;-+uYKS{tRi?(I#C;UOYv}Kl=miAZS?uQ<9rP{?tVqGI)I3 zupt9BJ8J1^J~)%m91CYsXMe*VaD8C*_ws|5oOqyU0s|CT%(3%sgkq5+%cR_m(|yL$ zCt!DE>4{vLT|eiyg=3pJMy8c{dg+;)(Gw6v{$Ra{@rX%TH?TfrZUP6d$981EgM#)A 
z{!XzaFKF1MQnT~!nvNLMhY4r<^u*CllW!FXI7+6gfNfg!_f(;rvPNBz>KJ|mUrA8NOEt!y|Onn5`6ZJQ*u3_ z#L!{^Ov}@6cw;(;2W(&c0lH^UjbH)c>iSd1-@HSndnb^sS_LGlVe}vr2bUc7A;Ha$T8-G0e zI9n=gcH^rV=iM|xX&M1tUGh2N9NwSED8mt((Y+fxJ4#U|R>TIXkmAneg0 zF*V!mBSO?up*kxrsp26_7%w2YSr&1rq!NiHCak-Orc7kbF;zyR70-9Z$ei-A&J}ZI z#>_}A^c(+&G({K;_4hI3fWmbQRys<;^w`m6Ozdj!&-J;m*1T5fW*8m-NVs+Ujg z-?G?%|Ht5pp$_dDvF}_#GNxC*)A~Wu5uhIn#1t(lP>bsN|Q+C}1ir0YWym z_LB6HVciG!MaXs4ovm)$mww_0&lDT#-vAbwtB}gKgbk$Tc_9?n^ehx0A1&}ZIw~Su z4p9y#=P}ADX^2Wzl(BSE7wOLcyRJP@W>%e(jH29FdK8t+DX{(N%pKHlD!1U6})E=(JoQ*(Ngeo4*ssXD__cJ|U|(7WL^y z^S-&7o(nPGKCvg6g+>34H4MmU!MuITI<12p`4(|(+yET)71k0em%Z5{Pu~0Dy40Ss z%7qpIVt>id)!|z`sti*(!dvKd|M`=lR)-1dx_Yku1-_u*NJs75MWu@t;!_nAga<2$ zuVXPO%yd?XgamsRiQcx=*v_dB=HEgyCb5{W!_wj`*haB}h}@i%%2JF!r=;PUe8-=8 zaZW|xE>iop4N)jkmht$Ox1+Rw4c2?Mqj^BB{(J+p5z8BK6it|0cT~^k>K06Le8t)% zi2UXzm}q%c9=!>@Sd6-)z|bY=*G?Wj1BbkAt?R6=p1w|-s_WFc%I;R~9(5ScAudH? zH(Oz6-}{A9XP?ct&^wDVuTYqo-*Z7?mOG%gWpOK7QZ`r) z8ZwF#qii_+gVc`t{s=|pIS52Xky^IHVaV#d6jEpwvKi z@`#V@Uf!LDOLDr)*~OJ-98ci9?iO(4-mn;#!3MD1oeGYL&HFh?&!0RqkeM#I6m)M@ za)fUB(QL8h4Sd~ZaHtn(T6AS`yg*w+`^X%88O-p#$pAbN)#WwYDi8r38`n~X$;8@2 zl>Fc$wejRH9S+F5k(p_@bfAS2UB||SjcQDty&}jD$QQrlTRYEnNb6jR3+uC$v}iM2 zEv8EB|69-TZGmkBQvOu=P~@;M`Caz?RFIrp_LwVXu4gCDUY@VSrWrodK8gS<1!inN zLY_sr=wJc#?XSJp)HH-CmP2_idRy6M$l~I5~?=A^!Bki#?p~ZARS>dQVXd z&h1+EG;1~Z|D;mXCzC_=Vd;*DOH$D1M0r@T5X*kNZB_mG5Is+Sv^eaG|=>Bvf$HpqCaM7o^F$v*3e>8)UA~x?(=XVR@gD z#XeqC-)CuAflj_?qL!h(WIWQ&h}BFUsV=x#7__G2+1KF#%sIvCzy2R7_Ov^ZBJbX* zewkiTJ^Y62?_|=lQy@JXHY4Lc)4jk>VWF}v6>e%q)Ddr0*OIjem8<0%y%pHY8=qt< zr~?`HjSr)>WsaK%Ixa}1y@nG3^?_(RX%pC_*eEp`A6HPITU1QdQ%fjc*|g}?3;S}n zZ&ns6zpLAK$kOxg(Ax{}UCh;(o);|hAI@mv)*DwBzD|$K6Oy=nX^Y3|h6bT1G}JTG zD1P~K|8BM*LYY&gBxtvx`V-0LDL?W?Pdmp){4`0nr6WYfTr(Z5S*NL<=@dorsN)&& zF6k2or%v5J1sg^wxjBu=h>qt`Wj|)X*@3l!S&`es$Hg5%zjbu{LJG-F-SPFPyLSa- zpftYF8FEL~e1`{M(C<3%s+6t(6ZS3`tGwlHj$1{t><&7hb5a7){CZS*w$^|>zxuV z-ze)iJt3%#T#R?fwC5@?-IC8NnY4%^mQp;!yh>R}k`7+PsTP{~ 
z_qPT0BX*VjO1(_zsEqb<+=SNkSK}LC4R0&6D?%#Q`2cN)d{Djckc!bm0Tau>zg{02 zSx;xroX~7#2js=x=Ca?D2CsDvDUf~Z1~A6S%te*nHy?!eyP@5dFS;(gpLgn#V!;~a zU!-?_E-Y-2?37|LQ^ymXsuO<<-^4~SBuf5J-0WFfRCeQv|4bNQ!`gDD4mIDw0lnjL>N>eV6GUIen z$Kn+^jKCy@THOq3nI5aTfqzBiD9JsL_Ykc3&VMue9(gg4U(9o1Y}XpuzZvlk+VR4l zkwpwq7^qyX6eII9*x`5JOaolh_<)}6bGhM@rxE9+WRvB80bCnW+L>`R!LEYMh`~m? zMUiq|&=MhBWa8ENUOfbKXqtPk9k-z%>+yIvzL=Q>aZ0&(9;99|_)G)ZL^F<@VuL)MZ8O>91+V}xR{3L-PxW|#<^8r) z|3)9hej~d$$tv8FhOK+B%#wKdF$iSqt+giqlQmhW{Ei`RC@Q#N^EKCE`Py_5>#agI zefRv%u}))pqNB897R-^|k6b>9alHw6Jg;fpb-lp0HWS`#5MLqOCw zSgcDQs#{@6qn#gDI>q4jN>{05$-}3t@DI;C)e_;(xn}WCQ$*aRbm@}WuPWx-MTOji zxzuv!53rkqc_|F}L$m}xh%|W&cNN~M4C5ds+@`0Ax8`;1#`y=vJ`g5~f|h=X^sY$` z>MEzK6)7_tp)Gpl4rVr3{ps;_)L3({A=fQ6KbuUE9doz+t>gGFUDU3i(TA$%UKT0e z{&-y-7`A%m?W;92BDWXMj-0W{#BuSo`eT`xcN}RF3VYJc{^i64nrPH>JZ~M-?d9z# z+Q2?Yy>u8N(`mQ#`F~zyCv%1xy#uTu1YZ`vXPLpU`B5bV0h`9B6S*M?O}djDezb~w z54hMO*fyO9B+G!P7%gkAc{X*>uw5D%JG<4M)g13>1~G8$M)%eB+++qvY_Gcen#Hh6 zNZ@AXtMh)t!QwKQZEjtEKikC@#u~kBf zt|44^Y}<}ZTlfp<|HOc`*E~FPB&%(zamiu*e+<{w2vAflW@HwP-Q?Vr&f-Vw89mYVO^ zRN8oQV?zH(Y^}py|0X7(TG2Xb5-fQ1%X{HDAHh~U$iR=05!A~5AWSPq4rq^n&U$PG zd|%AZwqq0y(0yXe#C6oXB_9I|I{uuMjfOi=5z8V~Uz>qQYNU#Pn0bM4mHvX$eX)^8 zr4_ljX*Kg55pibnFp&O9t2$KzO?qZxgXICyd*qQ|LyQ2u;lw9GcXzD&A z^%X%oe?jhTEvmvL_+q)bSb=uR@rl^p=3f|3-{bQc5mh_rWkEKte^_AYXOx`Ft)n6J zI0d=hgyW}BuS4J{G{gBpO$!n{_)f(#%mEfpua&K*_1*BF~;8wyc3U0%|I*c_HuEh_5Yv@&I7zhY7Yr`(t>gFPJ?ss$q)+ z5t`)X3YD#-TQetK?v~%S0-Drtf6~`@pbAlZjA-ELwy*W4(GBEN=k>eis*}Ce7o)gs8jwlF5{m-55Cf^T~5%_D8FbMLD-GN#*^MQa``S(Jxr-R9vMJ zvWGmDT-*xt4EM^AZMqeDDM$SWn!TTUL*R`d((ByCz~5anuFQ_^WmTG|-8%>H%#J-% z48d(rZ*;0RR#O2V#j z9LS+Y2Z08YGm$$IJ1i8wF{p^+PjF5v)Q3Z%(^3IR9Qqqn)##T`{Ik+gHGSx1!r;p` zyQZ}-HQXfpo$DTRmGs229ONtSgaSRGBtz4KO87y?(?bh5xO6coIwsWV)l(j5<#X@E zQ}leh)x#HFy=)w_;AZ{i>A*jy=r4F7`hLK{(VTdNOd?(lb3YIMz9D+M6 zu7PE7*F}O$aEHZxafcv*;1US#9wfL02q6SpoZ#*j+~t$^-~aWh{;PVw>dv`y`}W+K z>ZJ1`m(wUK1eU*nYnL(e*VyVDrpUqv(D zpYlJYr*jH={iF?x78dY%bfKjKDcs$x`79EUSA0YlngBn3={?~~x7Qon{ICcQ6jfWC 
zuY$Lc6;b{OZhXo=zE~EuIyFfu{?#@q+gVPtuG2Mh%S&i{OU2(Sdzbb~so_FkxkiPi zrD0I=eC-)T$=aR1ZN~BgN12!c>=VltTvB#5$oPc5oJ_mxnX1w{A*!`b#@1{_(|Xe? z%#$ph_@sRp7()9D+#0zqW?~(AxWV0IhO-k1@DQROTud1^lZ%#hTFM4eXwGRP?E}G` z&WZx%q*6>G8zGV<;Pp44^L>1;bwQ^u#%p$FFdrN=&E@>^Mn7aqmNEW5 z)n2|TZn1iiMg5gKB)?gsWJ`s(yVEO`-Z2W5$`(K%#P%zW)_gV|)~)B;JmPoxm{CoN z)m5%N8P-$N4CLe^gGs?*N+&`iqUY&;Ai+j+zhWnJ&%V#v)i) zc=tFga{h4jMDFQny~EpXwv^7uuj+TvOhpksM~!U5N*>T$GBrRx)MpmaHZ^(~#%P|5 zG%xLuRUVKK;5>7X`2OWr42nFpZKfh8Yd283t|nP*RYJLeI`ZAJ!g0h~BVoFA#hKmX z!nj4|-)AUG)v3Ge@W&;ey5JGWz!9xV@$;Q<8OuTWd6O#g{92q84=1~ zsNipe>I!I+Js%XK(-ir)xme1JYI$|v z+Zx+F`*)Bhe^aEHA&wCic$jc8 z-k+AeZBX)ac_(2{PjS{CQxH1rcALBDfHEv?#rkU^*4`m;$#HzZH(u{t-gE5DXTT@L zq|bIugDgJ5tyQd@sS65njjgwGQKogDq(901zAo@_U(`%#G{Y;@t*iRNZ?Eoc1P!*| zptBp-OXLBchLjjBMn_^BFwN3@yE4a9FX3uL|pbJC80*qtFb6kl!kFEIzLSi>-@7yZ&qx zn6W&oFoAGYy!Qt5qiGpQ3r8b_lK`mOs^J39ii4Hbxy1_un25-uo=wYWnqtCUb)GSO zN{J^9PY=d}KIi(@_4_drAB!kRI&gR&2aeF}x~o6D$r{`h&=Z&3*1^VC>fS7XU;=xY zodUR}@jW=crRc2ln~KgEYB@)kx!2QSRy%R#_+hoZ2F`|D?sa>KD zHSv+^?4Vz994D>QVZO-^`D#~_>Rx&1ki|Lw>7b>pOE6Mn-IYHV&^&!&_qIbC5i=DJ zr7sZYp>5K9RvKbOs*msa!*VQ-^0;wpa?*^WDw1v+NgQjf2m9OH={L&!@5AD6&%?c% z?pFwk_CYEMNu~XA5A`DQO+?bjISH7mHGLH1_ ziZ9M75QZ@}emF)?(ZYl5?IxAPhWT2~eMNp2M@{#kl{B23c{rm2`AQozY_7tfNG@$V z{$m+DD29&Sak6HfslpU;jn+@=8<`h(7?~yb zkUPeVenof*$)DjH1UK`4w$T` zqhTdT`8|NzPix+hODapH(sZ;>a6VJg{fi=l+|KCHr0Q=DLTx^LR0W+jwib}&GXhsP z%MsO$GqfkSv@38bpVm@d!p%nK_4tB*$>%$6RK`I)Lyn3|w*1iw8dagB%i5>%Wcv3^ zy0q?Nnke&MrXMa7if7d%>RcZ=k%&9c7h%UTd5|&rFsqL%QY9)FywZ%f%g$Z1K}DG2 zA5+>lBM}?of)cb-r2+2tr zYHcCfV^u|$8S4e{{A+CpLIx-a3Bc6rq@W8@Nmzz?MwV&X%`b}HEMSNpSO>R82+5I1 zYqK0Rzr(?X~G+h}EwhVcYxMGQrVD?}tC#+*z9`kV@5)d9VA# zC}y46(Bxh5GYV$*L4lrAYq8A=SgaQW#YKh0e3FS@ZJT~A_^R6S^soyJb+;Y1%p|4Z zcEZot8zCDnyFE~?$=@qD3kXJ#fKQ$T#ZNd&9%O?ytL!4Ls{znQ`p(Orr(X#q=XoC*;{fsbAxEkmC22+X)tBNFs?4#IH+niD~v{3VtV=0k0nQ2cKQ2M^Ek1vU< z=e%^V22nB^a?NV*OXa4psBJ@Zx@miv7)&Jqm4KcQ8a6W!Q2DHImmd%ow|W7|-1R!W~T(R4xnWO9VR!=G^`HQ`9V 
zQx3mBjo4V4;v?MS6)cHgWq^GhiT@3EjVfdD@;W($X$$>GAk>oM=-UPW^gXzFc0!Li zQV83~{S5l!iEAmR#fO1}5u>q1d~ZFAhO$%m^}7r5sY2?$z958;vdL}Y^Ac`2KPsS*yB&S*uwJSb2MeNMtq5V-sde~9eX&G4V$86>%x9-I8js! zFS#gU*MM)Z+zzQhv?@cd*>eBLTBdOeu<~1IwgRLY$hyPCl=(0PjY=}n!&}LPDW4?a zfst;}8%uE|`peDB*-07L>^Io%zH#TiF9)Hd3F#1WBuP2MU9i(h6%{e&J~G4rW#KQb zvd5u1<6BjP{t~TOH#3~WsVAzz0^stG;TfrYyd1*QO20X)-M2Jx5kEF1g44(v&UR`P zcH@79sL-q&x?#lPAk{7uf$x6=p`>K3TE=pM+wh~!Y zAc13{!wbra%ju!%yYM4^?N!a7662$-#Oo`c)vwS+<%*UcR^y=0EZ_*8Ok(S(dz=y=riPe5R)yqDui?a1rCm!UQjt1SUe1J-9KJ3$+n5X_A`{a*Gf6#6{fN($4b_XNch5^Kc#>!z z@^kB*)Q*rB-m`kC&DNQUJd4wmRLKa`P8Vjq>s$(H%(YCj#Vd`m^$5$f#gMr5bMXII z*8e&s6b%K^YU6Ai`zY6neuQQ;@;xs1uj?in4{ZYlO=7H>SKF z-KBo42U}@AIvp1S9UjiFVZ?gxBLgZr7+1w9gw{)46we zdyf&J^i7T>5@=3K*^_>(K|_z1@2e-Jn9%a77h_doFq;ad0olkvyXl?N0a+M)0q+-Y zm*DpFum|kZn)-eOQ2?jQ`kG{j)E84W9!#@!?MRV=bLKp?kLz(EFEWpwA{*Dy_NYdAFMoCehk$UpQ zIM}yg%U*wMRL4?PH5O_|>M~Ki+c(`wzS$i=XQ!~~C zI%c082g6RUT9(a5qLAoxcD=Vkr{ANX_>6CiA;f$N^q}o`w|KA~kAT{<{){0lr+};- zkYNp+WrtQluVE~sn>79^FS~b*s)TtA=B(?!d$B-kiZ;kE*@ljUOW7vh{VVM$wMZof zum;RPR^0Hqrd`Pdxaet1dA? zrF2_OT3m5u1!ITT=lzuCV~d)fW-0Tq?0!r@u4}rTrdp+4j3v`?nS8PVJruD-jpFDs zED8hi=>)%7)~_Ag!;cvSe>cxPTT-2IG8-#6-1$Ia4pzmW(UifwqAD2?e04ywue6+H1pN_jNj8MYLTDi503PbgE`ZWMiNgW+0GMIaroxoo{! 
zG1zwF{p`P6`)kN|coFjFySeCZ*038aT%EIs@+}?L=qufB&14pY)m=QP?Cdh65C`R+ z0kQY714ww)6mcR);*P&Nf*UR^$}=vr@aVKThb+bdKRbQdzet3gYsP#-q z0sDUQX00DR?mJRxx>=Fv+rAcNXVxQqop&Q;%kV?S+Or%O>{%q!vFUGU=3%tSK89Cd zj~qB)F-*IFoY4Vcd`#5VPs&WDur>QfE!$zQ zc+Xpf5Ms^;Llj+tw7f8gi;xA0L?JNkwY?^g3vUFj?LP6TmgZUs!Tw^e4YjxMJCw7b;D}P<7V?%0MXrG;HUE^C6DU)Uh_g$%ocC zNwdKUI}Ypge2$6ocM4PulE%ZgK}F_-oUdn$J3kNfHBNKeuL;?Jzu3>era^=Uh2zoV zSsQH{t^+!$J?o4eqYgOiE1Xq#mG}1t_#TiGhuA6Hwd}HoPm(Nr1q||)nBU1A(Jm(+ zmgm?{ap2a5N5u>WJvdLCzb9!v2$FnsP`{n-IT*r`x$kYHlX=CBz&by5f&NJtcf*_1 zm(e{OyXiFlm*2`!NBs;jVx@;QE6ak4?XRpjv%0jjhl`r9uWWb@!rF^4l*GJJuE4b~ z+!iSKOFXuSuK|z+u#2&kHK8D;o&PxHjZ5zux1s4y)dTJGoA-KB?3}|&TX$fSrn;9D zX0kbpc5%V>>Sr@%tVQazP`}>y!u%R4G97^_R65vWw%-`E$_vUHhG9e2ZT(^PuPJY$ z(Jey^gH$%U?!`vTRk$-f`P;j#ZO^MFEA@2g=Qg45}SMLm$>=edo2U zeb`_`k%ik4Ji?-7y@^5U=UcARZJ9^g1%A9akzgjSNoD)`>(VHQNG@A)GU+Hmb-*&A zvEGx)bVrMZ@uBEPg*tbQW0y8bPX;mgm7E~B>YDZkOtuk08@}+-OeN=2mS5i{<6rP zp|j2<0tsMekNtQn$K3NvjB86B9@ktSG_Ds_P1){pK2MkgwEGeesdm=&y&{s0;*Um& z=bep8k$Mfyv^DV(cH*;}5o&HAuJIR^Zo>f&@&cS|M||>kPdW1eGzqk`Ve!IGb`{$$G;zT%!K*Ep~19um!sxD(CbSUn4i=e|XV z*5|qNDPZ1ZYK@xwVNQVH6U+O{XYt1T+9HYJ-`wP;a_BEFqol=UL)yarxh_1Z((6^{ zM7__F$dhBR47|2C;MLbpLzl+RRsZy;>7U0V-Xw8(x^SxB)zJ3E|5Z4(F&W9ilxdPb zFgkW?Ey1FJZt~&miJ)E+kMX>MpBmdD7Ix@TGp@+r0 z%SotG)5gp|?{5xsC$1UjpLn){tGp_+T`t+}T|8I7_o}LLWf}dWl*AD|kG*jmPltp) z;)P#Ja!GhTqbvCy7rfiqzQj`!in3v^jWgO*;@QwIL5(j*3kZ8pt<`%JfAH5Ft+JPu ze;0GUVPzZWGn4;i$LtkP(rX7BH(V3^ujS)_+OiKH z6bMhlYKJ>iIF!rUW;{vJ2;AScfplKdZj|0xK!{&X2bWLJ;5uB^{;*rb!tsQCh=Lm? 
zeL0UIjfJ!n_@7*^7b{N3CEcRRVn=%&KB3__s>gE6zrnmnQ?~JW@?LN0lQf~AzS)^v zOccPi=JD`O$&X-0kx?ygZtXDVco;~#3)&31viLWI&piK!@EIV$%m4r8t9E>+>|6+Z z-FeQ|5&8CrCKhY0GMbhYRReoL+p%%p9$kf)CFBErUUz=w9#+XQZeH#b=bjcqW ze@@3<=3}u3{(+yy+H%`da;V4D<;Qzb=!ioQN$i!gDb|^y*RtP4^Z6jk^zXcaA`>SO z59y%N;V(1uC^G>D*eDII=-ag55hmC6YG=9eLgRc6V7^`ZHand!Kgu7Jq}&x%Yp>ah zb{bud>4yQ;9N31iZ3o&mj>W;x>LAVECdIeqJ42@}UJ3c`wff~VNB-cI=f|tnsfPtB zaRJUha-~vaDoV$U?%PUEUJW-}4or za6JV*hL<;=rS?^!?0ANz9l@7GjWi00w?;JGydtvqnz^^;`|Y>&lzp_E*x`*dhU~%P zE!aay0d|w6SGws9Z&>%c%$61g zUaXA>b$LsnO#LAsb~S444wr|dI+xxX$RXZ;QCvC7HLdsDn=5eno5Y)@1B0062zkb= zTbak=B6-j$SdP@kv7?cS{O5@f2C1|G5trk}>%its+Tb;3p8H%Fjn>X-4C* zw#{(8?zAdX%n`i-5-JFxMTP2PCW#+yC8@ME?9QSpv;K+XA?tFIYEUl-Q&jLkl{xK_ zX;;+xk>yk`NXmcavlJ92r0Ktrn zNF(;%EVk@iwA5D&z<5RprUTaKY7hv})A4<&>_`5wGV_$$G z5sb6sF!?TeT$=I$*)Cy8_TM(zg@2w0$N}PILFbh8 zaJO-GV`kyhvUPWWyxcWC%e!aubBN9^*y5~sgC%fI0K7c5vf6(K%e&Tf`a=9lICW6l?HGMN{; zdjPYH5C{n31#*G7UNm32Z-4?!Kp@kLp0cy$e^SwMGjnl)SiW%Qn>j!s=$vX=U|kMb z4+jTxGbg9Nw$QS%g#uo#|F#J`KpWx)eVH1-&B4XP#VZKn;^6xqW4#Q3=mL~% zJt2UXrEqh~K+J8;oLB(%E@tjFP>318^Pl7SIJh}pO#4F9|4)|uehXR~BExE$j|CnU=pY^_h_RX{fd&N7N0Njkz7E+W3xg}i ze~pg0j~w}fL>@3rrlIu~9~FT#EWtIb=_Z*G8wLv=2??koBCO?NCw%o8K2xA8^c<>% z;&5tj9kOIiPF`DCanW>vW%C``!78)j3v>aHsN!P(o1?wT*mOYMG%QRYwGwW`zCoHV zjEsW%yJ+BZ$R>n@t~dQ4ft-1vK=%==T&Rf4m3d;5L`X71LQh#MlmS3DReRqyxHHp> z$4#ml{O0guB+RBJ|60;&&s2kzNIX+946NYX+rx1&-2}HxeB6)+BC}Xy` zmV%oP_Xc&`fINy4-)ao!E6nqx8dJ|kIcZsZ?S!+%AFHPPY_pBYU)~fr^$=T<239B7 zFLdtPyAR%d^Qv4Vg|5e`3?3?Rjhc$C1TN0*qVM4mKH48KRPXxPG+^I+{%+~|k75&e zWM|IbANpwKD_q&6$8uw1#qXGiTk?xfz*Y3-Yi5&rkwA=Pbod9GrtlcoeU6N*5*R&8 zoKw@i<j(<1>3mANTX!nR|H(u6gr#C&s936G@UD4rfMOT1}D*t_n{q z4YvwgN9}^5hvGQS2C%ErOY?z1ygV{|JX~Uc|E%&d%Rfta@Cp3We*yowwuObM z>CNbm?jPMG&!qAoE(9(NIz;ggBS=4nC5GXDWJwHT|EP+=w3c*ph*7_n;30kuMdZS< pFcd z3dP3<<>c&WYG@1PzNs5s02jo7BI?t3IH`8O>bSJFng{^-$r*^9CI0e6o=RhQneHSv zjEz_o<(ECpi>Ah6$whS?A|kZ_H6to!wvODTFI3Vu%M3%}bH|I*`a>qM8Jx*zLaQ{@ zCdqEj_6A7Pj6(or`p@M5O#Dy%e{LC>{x92%1dQwqO#c&tiGYcbfr<5hT4n-<|B(bt 
zObqOd|9`&!cf95fWB1~nR=VtB_v9VTLa@4KX9A}#*<1N=Q+$v+|>De^jwO#YP zp4rbYT^CheR-Rn1UE^6G&W1v$p7@F$MuqL&*^u0ffCy{?>M|QUYm>2QkvbR_w-!c6 zpn3)d#)g5!q$m!+7#*6M8<7|t|84&|Iy3-K$uxxaR$eG7vy z0G7na)?RMr*3$k7Ig|pKg0X-Ec6YxO%^(UJTY%94Xku(;0A9@gR0_!Xr9`FGK=BG{ z3d$-4X74i9hnM>XchCAg;pOF(lJkHfA;+pC00J&t{=u1%mwW1}`8(!+r_KN#EAn4^ zTzRp6Lnp&3Au25>Bk7obyTJaK`(VzFt>4kE&B2XlBN7xAwVEFygYtx3*Gk1%Knwl`1_~3{`)lRY}@S< zv;F5Y5`C?c)ALJj^ZRC-wZ6rv&DF*6M@@`iU}g#Yz18!y$J*5VB~DIGLQhIpS|~;T zCCS6>$@wMke2t;Qqx1XlnzLGxKX?cX`=gZf1UQ4wa1bL;og^di{4k9Y6;_1~88Z z@XXL)@U4B1-}|k~{HN*%bZ~WJa0kG^+~@%Isg()HPhY<~EA~&oKPriOe(mdz?b{AA z^@F;9d2#q8=`!yK=u6&}lAGBJV6AUCFXy}MqEABLWv*PwZ?>YMCntuyAH@P}io)*h zQ4+H9&yVb8&nlv8bA23p1E}H`U*>OTVSIIQ^vSR5H!BM4-mXpM@l~!>jP1{?$r-Kf z86cBNt215W8+z04qC_W}?}OgN{2%bQPO6`Zm7ny^e*JJKp^Gb1oY zz@NCu5fH=X)=%8@^fZ`(^Cy173G@09{)69J?Yo=rA0gS&-}bbhtQmY8)5G4~^RGJK;~!@DAGMao`tDFxZcGZKz}(qHgiy%hK|Q#<^ocdi_z>ujBweVv zUZ?Q4r`I(3C+gNH_@W^?dNZn>a|Pz|;Vu>z$B2cXRiw#412dGJ9O`_2C&~>FLy71f zRmwGf=qrR=Xy)f_$+J8wM}5FCVIJNe7t!T?!p_k;_RDmhwnygoen{u--Ao zTCZRw7tF-|j#7qc>iZHB^|C51t>D5-A$#rZv(GV;8O=SUHYm+WJ>9LGzY+vLn?oKV z1`{8zYEQOvPc+7vBkFed+5EI?Uu|2(3i)LY|5@QNgTO_VF;90@od#&rNOAp>sEDN& z&5cTEfTPl`Xb66nuz;ia2OWvS7J#Zgh;AFZw^uhDCp_7Z0Wg@rG#Jk*jmTx`f&>lK zBWI)?H$M|bATF{eeb>VAHRi@4;78onw$*}6bZAXSy0*}c90C%)W!G*W89wCTXOQLe zo=IhNlM9uGOa$ALm=~BdnVVTtH#gGink@Uvu7_A zOR`7PrHfzFi}unQ@BF5GDX;gc&6ElMC0*WRFSK2a0|AGys1g8lBD~v87XdSsaA+WI zZJIAcfp53+BG0Oto=-B6>REAgm5)?=c8~%9#vd|JnI9ajiMV6Em~eY+H5(3*a;A}YsIT(M9kA;R@hj@y!7KH0y|XgwG>id z?{M&j3f`_Gq2mM8nuY+)cDU>;b$hxLR|l#7GnC(C%Kf zw`j*6Z0huTvs_VhaY3)VMZfZy%cmR5w-qmGyLZVaE8TKHJA-QQ<8Q@1q~{CUFNVIC zX?QYV&nUV_VCp_H#=5JA#zh6V%{XhvAaPKB5Q!Ke+};9ieA_anq%_k=x}VGjhhGL73g+KG{#>utSjOs`Ju7YzGw%w9$~gQPv?(~Vg}jaE-=iCL3s z=dLOiYbXJ~A*$s8x3_~zZc$-&IDcc7bE#`_1e`xk!TUrNDxjx6{ zY0BPlKAI(P{3&~Sa4`ax&7<0;#E11nYS8LPBU(-p^C!INxXCEK;{{JDKE3RUmw zQ9r89^f#KY-NK7)cDwNCT?}7aYE%trO*ZV4MWb&`@$n0?4U71ibl$qjeeYQk*LSzGEV~^rcUL&1M{vPVem?g`O3c(2a>yE 
zT_9kzUih<>uh39(+BM{v%d{;*(SG$n`quUQK(*~i2Kki1ma9cdMA(vO3xjJPT(2k$ zix~6hZ!oy$W?!fhOTccC!iiU+v1}Ytb8DMnDs;P`=HX}S%LXgdft0iWi-S&rVj5}5!3YC4kmWML(LSQEi3~E)#AzXZ6iaT=rn$ma zP`axK1EixFG`K6}3OwMe6(-pr>dsu6%N@U~!m}pa0c&(^j&}gMXVvS8_x zvh<@!^g1{~?sEd-pWnfzpz8*DZ;D2k60@bE9E1%sX?qigqbmdS<%v!TII*Jml#M&e zefQ*wjVF6Fv83W5<@7IvaCM@!5X8_OXufmL$-*V(X#2co>uj5*io#2P5V+}0-sO#f zCE}^Vv{zR_Xtu)-W{e~Dn8e3sGH>abJ5Jb44df{pVqG0FbM+xiJO8okGf#JlFy~R7 z)H_uOF3;ete0NWqwa&H!9*Tg0q?B>JRN)jp4`ji0Kc!YD?h_uwUsSg&L5O_WaPY7% zkUL4PUZ2o7+O)kf(;W?DBECOoG1sQZb)FR2xh5Lv8x(!g&yAAMa1A}lFsF*Av<*Jft zL-6NE5F?Ur+!KV+S4rLG%BNB#L`fHFV>frlBVM`zRzpX)DQLuiScIwrME4wlylo%_as! zjcwxP{gKbzRvlhXWcF)nL1c~;-3fW1u}v-4_UTGdh& z0(gMRUP<-|rJ+CXIOJy?v?OBZz=+WJ7*avK2yW6idS{maoII-fbBaSH^ydBkw*ksD zu}GexMO<5N(I%+U?p6&RiNEh0SmT0|X5#1c^Os>`JHJGjf_2R7`tt3^!=zciAE3T%kGg>rVToqHs~;>TNo*p_jg#)Az9MMY|!V zJl@smjf!FqEZh8=nF3=hleSYgMCXeTa%Wk~T;Br1J75!UG0!{#m{)k-PPw@0WNFp;(jDGmrsopVA*b4Zy>lo$x$#=IFJB>mQWpKM;zk-EJQC}0-U zZsg)g`ki7!in~30WSEuY(SpdKKOUsWIK-*NqncQhi1(xkZ7@(Ls=9!N@}YJnmm7_Q<=O)z<2sulrDF$2if4@kxgf9 zp`Xz}qoH{|HIS;N$jbS(TpXs>g!+6}KVen0UEW6{#TCe|vbGe>0I(hSEttL}HbSCL z@a7OXSd>%0ydXDSM?&iw7ol}w9)+iD1WdeBWM1-;qUaU&ySm!AL0rFE*ZBX1fRaj) ztq~guS^4&|L>7)6MzNVbWEI~DAs~ccjlkwT2K%T-TVCR6uYD`v67x^GmGbDDL(~Ar z-gYmf=}FKagVynv1pnGZL03N9Ts{KFHW(ltWQsGWNOMdb`6F#Yh>o#>z`_3m_JxJzdx zM65?v_cN}x5CA%W^C@hTo7Ct+AI!|FZ}7FtdE`^F>1_grM$7rlo@WUQ7~mf9uOS*( z+TC<5X}J@=hJv&!R2*l}$H8e;dq7@|q{PmLZbUK{-)`Io@$Usj)-Hg$d&47_Y1dh5-HNX^_(sGlj>Nik$?DA#6XqyYakd=Ri^}|j94o|Y6vwf@m?k?$~ z32yj2vWM9yGC6gp?TO$yUifTQGorajQOA`&|;sP$|=d zo-WdZG1I^iT#Lh!dbT5l3Z~yHqw&^OT!_cd1^~PD+!%^D^}Fs<=KO92EeMcepB+XH z=2BD4ot{sia3vw$=mp|sC2Uky|D(JjZ3N6v6>RfG5^{ZYE>JF|`_=lUPGWPVAz?pA z?L3qM%R~XSfwwk(7Ol0enU%>_)b%3jbxW4T$yAqE80>ZP#AE?n`clw1yOu@9c@9U( zSOpR=6sp!de#!Yl__J~d6f$tFfmAW+3pr{5v<;g@QEHwB#4@^5!a90cHLG{0Cg+|4V6;D|-rkhqpK8Ko;; zt7&`ZAhL-0SqIkL-8M3rZkDOUdq*(kXqbr)6Kz7%hCz^Qh|n9skuVQId#J zmN5*}sD6_mQ){F{$xJM-S}6HNzYs<-6P$n#6NO1@Q@cUuD@Z00`7-%onY+j4|C~hQ 
z6c6uj*ASY%J;xS7rM+slI_*IVmdqf~*n_T5@8mT~Cw?b=kjny zUW&)pla>kL$XklWWr>b;TJZe%NBvB9%ee9 znt#zeuy-yny46tV%6w#KP`W>;N-U5oDLZ~7w1TKSXq=zRP}gk}_?(GT!bs6y$FC%N za9f4=*U0vktkYS({Z6&au1-&2bCrO{w1Gy8Ec~_foNaX?!hU?%9>8*}uY6LG0Sp&W zFUWU!DZeyA06I)v8$P}#*~Z^D)LwT6d&|RL6%e_)nVjq ztQ$`DgfixOlvD0vcNZdXOn~5O{j@kz;CxedK*TRee)=A;5aT2$sFcQ&Rn{mil$#^$ z16W=kg>Gt;6v-hC29HY39BC5YGWK9AXXV|>6yQXLiJ6v?sxEf1vzl)$Sr1J;cV%;H zoBte2)f*|K=F`Z^s0q>|DG_rRyb1R&kVz{$XRMBX)v@9yWGpd(>b^56WJH2J9Ac*y zQt%+BW*3kmywLwUHEkdgCqkon8U@D=97#3%%Fo8&90buP`pZc>OPp|`?2@c}FW*qm z&thwg1YY_tn&Z%f&It=E1^>W@=G0h&tazu#yEDeM$&8M*;o0_}r8gsi4;tzHv)yE{fsK~;cImkg`Qt16WuqbO5@IiX@O-& z!s<5$80GU5!d`n_Kn{9(X>0=j^ml3eLCCB`i%xFaAnTiG95w^Rc^+QF0$hi-Yfz?d z;`!J6vtAWty3WAs_qb)CDYl>WuqQl-pZsdQesrgd;&5oxj_mJxG| z$cXvt_*nxIgusu%2Tq+7UPYGkYk>r2QuadvxF%uFC~HWL;A-|6OfdQ@ZbmTW9PWE? z)+gZ!8!`iK7*9*ViMF{`mjB%z5Y#o7IO7RJWiri+5Yxuw_4@~ROcSH9#M0MqdVdN6 zW?c$?vgt9mMng;-YGkt5AbwRn{m`Z!ND)or70(DgY z2g?H+B|p`Z6!k5_4$s<*BIOviZrf*;Xq;S>i4Hha&GJ_tp1A*>iwoFyH`2}N$%*@g zlsx|Q{5a-MxaX~z^_|mJVoyX0k~bQ~MMq_tI--%_Aoc)9zM$`&!eQizKs}mX_?^yK z{}&v^8u>I7i=_cF%2OKOSBqQ5?dV8nWUfH-3ddqMY4TA;A~$e$Ih;-4 zHlhfuE}QDFCw5>lTL@$|a@3YVOGQ!dUO$bKO3@!I3Ph(zIrn_p#@-X>9=Y`M!fZQr zVR5_|f1g|LHO%iz_sEJZ^LZ%ZFwa1Gf-Z@AH%D-}<;>d@m#MvfcKt+EIxOa6ARBSl zL@KL!QZQ%LTt>2yLKb*d##}m0)ird0&L*0q0Dto59AV;>T0~Bgaw=v8dmC}|toj1j zvj|FtJ^xrDG*7b~&G4M(*u=?NozDHC(n0e?_-0J@V08~9QbQkhMdM=dOxj@24P76o zD~p|p7EAG_XIo}nez|It{VT(47^SL=@jM?{ut$L8AKM?nAE$1N%etfCH-vHH;&}_| z?wBk;KQVg~1JQJqS5@-Tc64*OL9-i+Raj2n@iws&Mwma)f-K6b@R-ARm1ujz0Rj{5*9&usBVC~oxEjST6`NIi0c-6taD@KCwhm-B>J6HY?dmPAgk zAtzrnG)fJ-=5H0r^`JOgBN`9Np@rkPCV`lpuD}qTvXD|Jcw=Xi8BDBUl7G%4DDi+$ zvY4vbMHZQ`8QNcy>PRs#SL{v6oR)QVAX!?hHmu#gmIIlD-{M%stl0)M$-uv5wx*7?BUfHqgK3<;hS9 znbd>bI-4QfQRJ@KiD)bP%XJoS9Yk-evKI5xY*_Bwz4B!?sRgjzql>~^b5t1iw8!1q ztmR3%uBi05mD%jM&ITK$vXvr@hzBE}Y?CRGI$JPo1ldLQY`fMpz%880i}w_MT~-r| zHeZTQL)ta6MR?tKhaq4=&P}>8W?x^gTb7%u^0$cK0Ec11k`rg{A0-{sU|&IdT{mCm ziYwxlaD+Q2{bd?#)mwZzDVN=`w@h{(sRNJ3@ 
z58&zNHxI5??S5%7Z=wt{zsgzVIV02ZJO^ubiEEVV!F^|aUte(*18;agBiu&0ELYFh7o1&iYdmCxLzRPTw+eRFk7ktsXm_$2xHP52CaZKIMY>d@9u%2H znhW4nA|Of`)YB03^9mc~R`5>yoef#gW{#{TR7R`d@z&*Zn8t;2Ex|zqLhFH`ANE4l z0gAxd;!1sVr)Zk9EfDx}KsmM$)`>>#D@K<@Z(MF1{W6n`R?60^5v)}YcUC;zL$Wwg z^9kD}dJ`8zN-X`Xc(jgU*Y3z1)lN~uLSf#FsPSlu>vF z-)oZ4XfQ*-55WlMHigl8ZNkCx1$k|%#CmM}YehNCjpd3kfg&({%#MvpLAjtoTFQ(T zG{p}=DQsPRpxW0nhsV6+IST zqQ4#mb+^Nd0Y*Ua{1Z=<-P7JXxJH}44=}J{Z~EnGZYz?C5<4Sg5Qh2VZ=Cf{PFmw} zh6-Blm9w9zhsuOXPIsC6NkoT4XP>ptYar)JAE!%{Hrx+k;Rxgp|Edi^dM59H+FM!8 zPW;(~$nN#Y!|9^LI5;8_BZ9a6E3yj4-)ZA=2vkTTTBZ~3XOC8*L}X32i%*HL8HeUs z0ZRB+I+{Z*^!Z5#y1;EM(!Rg<+0dFCBa2}h|BU3s)-2A5 zz64BSVTt3noiPY|d2_jkNVcxa8bQQm1R%&Y6O*+P>Jr(As6O8z({s4y2P|SFz7Hus z4eQtp>t^*|7X^qz*nb~DJj3&9q82g^)d;TVE|WQ{KPu}<4&jRb9<6IcJRTXvp)L5E zkQYt+5!zTKBYs&r`2}EYgl8VK!Dl6ziC+{PK$|=Cg{X=hpGk@?mP@CWZw}KMV6Z6O zf7@Lp>m}sJRpK)Xhwjt08@~Tt6Qg#I4s2k_-AxQ`Y{0BtPx(=Tw5RC-&6mI_f?w ze1DLQe9I#5G{ICXYWlY^_OM3!^cQFJER+K<&}zT9BhbDHAWK*2I-@S_n5TBmgE4pj zPqP%8-<#jgoq+)josG-eeyIp3DyDR}48a|sxj=)LP{w8dgC0VnPWMx5z>`>hILxdr zM-{h8Xu!v>{E~V+qL%kW;Ulppf^xmRqWps1y(SAA?`nw$-y+!qVQXml4Ks$iU@L~` z(5rh#5zYm3o4O~rneciHm+~kw%OUmDxL5b7&qOe6Z!-)y5@@}{zA1DM-#V?rk(}#t zm_{@((aG;~*Ktcg5FjSQy$aEOq!<8m^`GH#A~u}r6^j~Vk%Tk^A}gGc6voLW7VQBz@q%*HHbutD{l@p0h-O;R4jLBfGqfViILyR zH4844zV*6+v?i0#>YcA8h&1ODY55qaZG@#r2420K5zqAzVa+h|9I=UsU9aNYvlSdM zq8?;$0H(5XJ=0p9S~>N_bvukc9ZgmHJg7`ez~l=95^6$d@vMDJ5zFu{xi*y*U1sj- zwT5A>7GraPpPziLc$2Uz3ri)|>jne4k=FlkYfid;83+h>iySv1(5YlF|7fs^DqWxn z*}rLW6X{I~sGT7Y86;V6Boq>UT~D@f5Dqkh5afE8Y}yoxx9O>>AA5*oL>-t8ty1#! 
zY6B7@p-~Q{L3*lqn~IV*<4j7t!)K`tO31kTKGO%fb!@jsgNB`b7u5jNX}=cLT)ytR zuAd6b3!O8wC9UrA ztb(nz(?QE*rJ0#;9jnJnXrQB4%>t@}{m`hKVh_C_|DrIixZ$MTs^A5{pnJW@;ScK0 zLlcw=Py~!3?iUS@kkY2Ud`QkA{Wcx{OnwFm(+MBG#|fR?+QmtWMj+AZUKijv6Gt2) zzHb<#|0UY4zZrcSM)F^eKKW4GdgtI;Lv?u9eJ$AGYz0{*ZM)P+)9WU-3Stc%+n?}d z9#WE2mzNP!-Qq=_$-Y24fe-h`TVX%>cWDu*k1|!SFQMTQm1o~tD<0oLjZBjeXbQNy z1XB=GyjJNK8Cv4Xtf)QB=n|MHH%8+D1a3$-V*$u{(_Lv+(GFXFG6@m0J^v}OA@3!N z*11frI)s_zwN8|ytQQ$Diny3Axz}=dYD;o@!|LnH85oR#NcIM;H>kF95W91oZ3P5VPXD~s^W1|&7)?X2EzTB zh)9rK3VA|$D|1NpM$fe2CZmhk1nq{v)iI_%gfP3H7rk#`AODg3x&;Djy>7zi7Y#aQ zVHGopre{k+iYu%>J)}H^{C)>%crQnO$#2jf9f{eR+|+9y{IX{@`i+y!OX=-Cyr#=~N@&q-XTF1vTe!sO?1TPR zCb`GI89~Ka{Cf$^&%YKYr6b8`ag4j@xS6_WID9ebH1mi ze3}!PxvE6i0NCM0BbjaQe22zB)%lvik)CmYewTiglv2U#spQA9(ZE3sB0SOAKu(Bd zWPs6tFKn$X_+{m)N^9Eoou|ZFG5Xe@RHGn5)Hn99+d)5b6RuYU@k&hVZ`9uPa44UkzF zmE{OYuq0H5dRZ8ootzt;MbSjmdBrKbM*5tszuIoM0`$#@htGOhrJG!$6rdC0>&|fv z8#i1&SnnkChHd0My&w^oecGLiT+TF%ndvdE#m?Nq$-zU>rjtIjw4m!S_PEBpyF>kQ zJRhMQrH#YYgp+v#NKz94hjq6@MbQ`5m8>lI!i~w?q*H}U10-ct904CIGEw>8f^8os z_^h~;kDXRB*c1=r@|E9z4@of!(7FxqdIJE9^k}sRP9w4A=yw~N1I=TAgyG(`L z`2=UkR>z=XJ-7-7m}8cF?}`d7;d#AVd;`S(gao#GpSND`uQ5q7B5oe zf+tk*x7wJ74WJ%dSE-J;%pS$o{aRsxji{156CI?v+AcJYhF*Fu>IvCN!I)C= zJ2AnW`mx16Wuo@A_Po$7y1jL6l*S^t467vQa>Q(;eIkykpSybmPo}v^Q@Oz6Gs2N0 zLk}$bZ%Q=v*`WZzq8cC#Cqen-q4Y1MM>h~K%LV{h$CAr7ecX%s4b{nXQUbEPj%b=8 z2OzUqb7x(X7_9n;FKrcM+r_Tz+XX>r)%UE48Gue2%OXllr1E~g#NtA1qGyoyEgb#8o_w^%_3A2t>8L$KNqXa>?+*HE>d&-H;_fmB4 zqmVDC=FOf1;+R@GaNtR2tPY8-+aGXraLUz|WYlJ5P8s))#zKQ~&fJYZ0iV1tJ~?d|#eQ5?AN_*)r`=AVnusi5f6Y|Xtl@Mu6YQhiuKlDoSmz{>y8#JSiL)D-B=EOxo+0+S@OVrp zCnKzUJ*evlje)i=;)N3bnL!kw;IO~&<+1&7sV(S_PnJ3TX-R1q9aqUS97TqotU^lp z`U`)?$5+4k5M3}0oSW8rx6SN~KK@jTDj((Fcx)*BrIuusF}*2NwClO=Oz)?v88NQ^ zlX9#)eImGO)^515650p+3m7h0MBksm50nttuO#LeeGNQQ6VY*}YHCjWIsTddUUJr( zc%Cb8z3fO-2w}CoF~D_CA0`{2%bPJ%Zf%C;3JknPOjmz2$VZ|=$eiZ$zt2d6a*!kB z|28`vw2!NQ;T@Ez6Ibq4Cj*zB{FTgRU%`zTzz(sI|Jn z?_(}$7v5EZ9zE)(%k*}}irp|{;xZvJ${G~pKzzkla4G@pcp3-vZ$!rzk+io)7-{;F 
zaPFhH6(?gHlf4?-O4q8icZa=l9!8{3|0qs*tt=pst}}&c(=gxaX0`FD_xJ}s!Z4i~ ziA_USU`YY&bo2h6GF#Scc;}(4JN)6vspfUIX#~mH7(3GX1d4Y*^FHLVH$`j^DV`HDh(+ZQ8QE9HULHjK10P}EDrlgFQ#Vw z<`#9>8Vns(^vZ>QJB?Z%N;vwyIdrMA;GURi8dsNUkw@BVV$Qqc4fe;K~LsH29Zh;UD zQu@?3MlP;x3F!B}ng@eN!JxX&TNvof=BAtB=Z-8{1Lgr32yLYm*Dlw<FN9I8fv$?uz9>%E)0G>B%<7Yt2cYE zki|z&*eAJu(AJ8bA@|`_xaN{G8;AXetj`XV$l$LqT2t5dRw#YQ^0gVz%iVWfo>E6a z{!C=pv!dx^*~8K5nAZn!+H<}NyXWvWcupu@csre9Pr*#P4ghU>aF`mb5izwQ6R+QM zn*FQ%HJ^?cY7N9Q{aOFbU8Wgjg;8*x=H*XA1P@q8aI!CT#5|o{$)+9X$xp9S*!kK+ z_oC}nRl3te`k@cQ#$7F968X+7+%y%mS(`IA>Ue$d-e+AKj^204`r;F%@%7v3?QJk- z65eR>w=k-SzepDFrF))iJr%c>GXX1qCim9{`zSQiKPfS;5Dfh*XbU??X_Ub*4im00a*IZ}H`%*smX9&9>t63+o zRQv>9lM^${n>BTbw=nz+n+0pA^pu_eJ{?MRH*%+{YJLVn(O;1{_Vfi7Ywl+Xw~%C1 z0>`N(F~o|H?14{Gw7wR%pK>(=zkLAA+j>Jk)+uBRfk4PkW--8z)WPb3BM>~u-Hvl< zOX`BTb(v$O-J$#{jf^23r`OQhhD~3%gO5si5}xA0;&2U)3&GwnLnL3s%p3+=d0mNP z75G=2)%mBx_+=CxJ#SpnmGWLWaw_4wh;Fg;PH`o-94hhWgAiTs285OdNcKWe93wi1 zy5hs~%kRRt7+KYA`J;EIx(M@)B98Q~_Roz46$|?sVdX{YGvC$7AR4qzEJn$U!vaB3 ztC1NnP3+t^QHi)tTBP;Ypi;R_$=kq0^1XYCn!cqLfWld4Kse4;SsEr6X(W*uB%5-s ziE9hY)zmG`pxq{KM|PiY!F}AFg3^hJ9xEE&ooEmfkz8v#YWELp6N*Z^!)m{kCN_}g z?4r0AyJY=WH*AfZaZ|)Sh=vV4xjdZqU`C)`oKV1T)LJkl--+myz|~xsT6P>J z>2c*UU7UKNe|s+GERZ>qH2ApQy|E<%~vv~xuQkFkN3LoyTGP5YT7Vd0)Hn{cF0&glk#IA;tb=FI)ysa(Gx_YmYKTjN#}j z=Us`c|5kUm;maA{(|nRJ&M((MH=GaB2#B9=@bhWsXYkOsg;jN=Ydy2f-G!YVU+#S_ zn$LSlEOCTvpvjSeHt!x}OxkEii-VyOx+4ZwpE~V}Jo@c46+*-oXZuPGhfwHK@z*OV zptku1yl|du{I(&b|7Emf{}Tv|qsASu#ojn0ev>$sJ8xHV!_h9g+EDJUj!RAJ1(w_v#?g4o_v0gjBRQX zP7MuZXCM^`tzC*U|7Y;02NSp1BZRwuz|1(%7*k4QIgC_7f2*T|ub>y?mKjR?lr*LX z%jWGozi5@xCct>)ot2fye;QpOjAWe0i8|;>@=aMi_dR0>4OfUKs?T7^Lk=T$mc}wB z%|Q(=Tg?*T0*V1S~uN7U)sRq8wVsy3^vdMfGhSB573WD)sL_svNr;yEG8qff~j$&vBd~} z+fB2&J|@eMCx00g`!R~I7XTdYG+^DZZq%jcOS5bTJu!6sv>kJ zH6KyhE@<=;_wkVg6fsQ`nb4MM4cZm3HJPS=SWDUsOR3XneiV7HZcR@{?-OLj=v2zd zUMupzK$S@STuh6IemsQ~-}9Ur+v7i8@pn;%Z@>=cOdt#A@hsexo?*v1JcH~?cU!Z$ zez%CO%S9xeJQ4bD)Rc2rD~#V6Mf4jrp3vEF=3K6_#5_J|jLudOCNWHoQv%N_?MG>@ 
zv*5cPSE+WtgRm{)lU__hq8(DApTofL8w4S7;9xf34A*?UOLv5;V41CN+82HkI{N%K zhJ=TMI_SvcxuP(6^7>Ze=yo~Hn@Xp0Hc4s6jrIK7c@1)pnYj%31>`xPLU`diS2zz~ z+|>8mjK|6s(d0{O@QAvzZsy*z9k_*japaJpFR@i?GP~N6I}X~RIKlD~E#}3%)n*?; zjp161^4)yIN};bwN!DGi%#WGaLvq5kL2u{FsS7C3zjPg6I_0DM0}xj2S?h&DdZ?U6 zj;gbi_)U$q-$xpZeFT7p@l%f;_;;-e*WSboYnS~3%G`~a&ST3GfcA-yY@%T@Y(dsk znMW^@L5h~<>`}yct+AwyuWHR{Gn8YIL~z`|Do?-R3-oXEG6K8R9rmZuAx-^SkcZCk^SJZRfuD6e&)1op1sCSo~+u*fT+T%n%%+7;xVz zz$j#!KT&J^+*%xTEE(&Kg%G}L>#l%yw)f=tUKj7^cb4ag-@NEqkJ9>{I%}n5@r5>J z0g#_s;|NTiy*WBc-;ybRbcsGt0`y#qI|)o8Sez^W>E#e>8*`h)!}k0DPVLi+%5D<8 z;x*vuO0?!^82S`GKR4cEP)X5x0*Zi1{8Q=q-F1=)fx}z`!M*)VKR?Ait9s9oK1l>G zcL<E zIQ%SbdL~|g3r%_b$qh`wMp7By*8T<^JYtl5z4is0QcQK4m0J}d&51X|1;c}($gPxTMyRpy?e!FUBP32( z%%FMtm?~t+KmdnNpc^Y>kr;Z;jsA!1mdYGS?q;EVxt;%xQ%i8InV7{c^Wi3CQ9-9p z%x=M3kTPAw^Wz-R5Hk=T5ltn?ulnfdD8y1fKtxKWOwgnMJ zTGT6ED)x;0l<%?=QQAy*w*5Hx2aeaXH*hD2&LAMvz*L3XOjR@NcbTFgkLwdASd93zD(I3m30$5VP`0Gi z+0tuxQWP?qo^=PqBbKzsI29w-sqgSDw}s6BEW5jzq&BvCwA(Dwi{hj1jEF`R5jcPg z7@KS|re0xju=ZvGpHS~!6-KsU7CMFY(rgkAsw+mOiPCN%;cnXULrw_m`MPwvOYX#@ z0Fj+Mx(Q#g9whcxBgqw)#vMX>a_>+uyFK~zW>3W!8b13qH@Nv!_0)ZQ_6Tlt zIg~n)<51_}OA<6WF&{LooY}t{akIO$n1DdGDloxI+mz|!UW#wyAfggKiW|)?kjAXw z(-r?*2Di`bjd>}8%}a`JHZABL%TDCbbxRUS474RUB0bHW3EGDrwP~O#JU)-#YmdNk zaMXX4YlM-)0VFB~%AXPA4tsM@df2BwJgGxPe{qvy272t2N=4jKd-UUec#L=57-@HCjSf>K0Z5qbf^ zV4Pb%M0d0fy;aDiM7^cIiSNb$G{0d%upnI)#y)Uk;H=rx-q^{~YIOx(b(iwJg4CbP zqCHRxZp{eirREvxhHRFf3y@6{V2x|$IqOI23~7`q6xWfLcu(d*yvQ9|foHA^A%yT3 zrR&l69L3aSpP-@n{=--0%`!2-on+~*vM%Ck%vx2Co;G}|Y{H4YrfXMnVzi>gBIXno zF*|v9V>ygF4cgr!_G$75X41Gf{PuxMs)RRAS4~=*)1dy>V+`{GuL&h%IX-L>n!gc0 z9LH$Kq}ilBV92ir=gi>uA5CNX(n02xY#Y(AoiU+DA^HGg?_{E=VUAQSPUfkgpiU`K zzzwp=@Jt-pVd~hVoQsWzUXZyD7Ws03L;f7Mjm#0ZZ(ivn@HrZ1uAPBjU7H5DWIb>G z(O5^xn93(@&J!q)acpqt4XGeZKmAx5a?%0x>m}{;Y(j#dq;K6JpQmE{jci(`j7?eK zAFW`~aj9eAzwIp#Hf+;8*B1WS!3LiTpP2V@dS#v7GcXi_3&Y@;z!Z@lrckk$p~{RaH89gLw^^T+I#R~E@o|I zw@mzmHG41VZ4D(!5=*yk5Q(%ha9*I=6I8l)k3A&hi*kYjOuY_O{Uqmqlp7QY%iJZ% 
z55mL=75*pvc@n=AnD|f7g4nz|=kX4+&&gpJ1M!9RoF9RFZI!qSR&U;jh1!XY-@%fF zs5AeuyZ@3c2+(7_6*WSe5lHZXg*BYrR(Loa8tt%=)>35oh8fTca;Kx;OCTZTFLjOl8Ka)yGEHEa|Z5O=kUn--5EPFJWeMbM z550i9 w1!^AlqWV@va3t!6?B^C_g_haCaU!DZdQ?W}IF*e9=&AEtt{T`n*FzJ~u z2;iL|L?h$PW^t`wF$wP!>&MmHDlcvv2Uvc=uln zxYz!GoE46YU6nyXcHMWNf_FbPGHgnX<6^U#yn?W7H}-H7^C-}>PmT)T-fNh^ik7in zw>0N?dWKK#2GlLc!&8UwHeSa*W6YxV^HEMj5u^c^gG2JS312uvb}MZ4pPex;vW)jY z%DAt|XJ)hSkA3Y9`;dzfiut`)yC{s5Hz2FmwmbEjGvEC|FTWt49PwAr6u}I(a-9g8 ztfL{7lmt;2Vm@hMm4oRJ{(%{KiSY9@7Gh;QUg%5q!jPZ&8is>TXyYU$tes)#ehcM8 zR9%sX;juNoKJp9@{&|8ke(4O0-XPoK!;s~9VFrZfPjO;^!jYfpcbYSab?{;vX}_)5QjqL!#~I{1ITw<%=ch=yNU^knkA z$ij`#VzHX(Y!dj;GCy|l_v`FQQ(2+aQ?h0&}nRSpT zkYITJp_Ym}JEI4Ie>6*a7ex}pLGG^{&=~>^y&M&Zzdg0J=FA(I`ud=>TiU#N`151t zo7Ajs4j}kkg&i9=IPt1DacC_EkymT4p{eJ15VY+E?d^>S(HM0C8a8|#>u}NLc*XyC zi`=W8WHU5U+`e&dBEJeRZ@_O51UqsdRiC0em}2x@wJB^yy|~#wjmpc|LNT#fe+hOi z{FGE{%n`+issZXi_CgH&kP2OWZ{VrKNlE;ssv!YV#uLk0Wmh_^;%9~|D9Dh2O$0>sHYaqnYf2gtV_^mj)HnR*``w>sT1;HyqjaN4;QGJ zxJ2@XX?xgTv<}u4_C`+kNH?eRxTr5ui`Zn3m?So9PbERY1`rB%dbaBuC(XESc3+JH zQ%**WPApRDf}woDi@}yT5eLke`3{IIu@FNlSOxi#+f(Hyg;VKxR$^^k$auZzO;)6G zQl`D_(#U$oOHPy6hPBbeOC~Uvjx<1G4TV{>OOrTw{~0CgVt@ECvX45dD{Y=F=B;>{ zLrzvvah)HzDnrJzv7bc2@Lj)l$a>n|f#gj%3%B^P;Vc6aiO&U5RuV!u#C(924L5g( zJsO_G6f0)$^&{?)@8D@q5zl<#-Iq5#njmk8wcP8SF!Jn*WoYBHp;jgd=c}6IYkzKc z9H~-(+Cugbkqb52EFlb+?c*6$R;4Awqx7?Zj8^iPLD}Fyam8fwg{o1v*6TnjG3ml? 
zI4|9@cKr}q;Gt6boZ&fb$s}m%@-ayiXJ;j!yvr`3NfxgHo*$TEd%kl)8DaEPLA}yz zwSR3$I?t)vBEr|>Y@`W2tnhSI5emME25((djM{>`pz6SaJXDJ)5pdGL9?STrtpo7i z5IFu-ZIOa@R^`voMM$l1c?=6?;WppkTnkaCAX_II?$K0hMSmU%Y`6!>dj{T6SErJC zamu>>^cS@GnY!YjtCFO;L)teK^EGrfFk`D9y_*1N(i=-hYw-&+s@R8Vk8oPS&xLkJ z|E{vwc>Tzs2i;PG;Zix{jOw% z?!v6?D$G9 z5D(&TejsQyr3WIgv)kKN7$JdR@7+sL7D%morTHh@eTRpJ{MMRO;ti|5i;%7Oi&N3!yNp(ipjiv4DgL~fFgrecB|4Nu?)mmPH#*-UkK%EQ&ZP|v zXMTL4koc5ew1~)+Ou*~#vwC7N(l2ZUNWxl0>y({Ig$IWE#|Yoz@zWydbac4DhYOU5 zQ*2}9wt)tGudsXk!I$6s8h!V2Sj&eI*aT-YvBI=zEz{n$>CBU|#E2XY^qq;@h8T5X zkGIIogKTEIx*?|ibg~S(%2g#XD)hdh#h#;0$3>LQInxi1$0Yvqs#W>d1b5@VCoe?( zL%y;$JcvCix_>*fhCi&pn@joj+K6hR$5pA=YfC`uSt=2otd4U zi86>bpyq>-l|UE&F<$~50*QFOsvwBg5Kx<;?E!X41j;|deJ6yeLDU9zf*pkpJbMqS z6G=Z&)Dki0!?BU7JpXmw@1U5yG1T{D*xd1P(;H#}n41)UXrYy*gO@MSl+YDW2&OQzKF>__*MVfzn6*Tg7L{1VVqA+`?3fF1>$WUInSN$BM@lZjIJ}H zB^-PcFzirGAd-e?Rx;pxPT2d6kTE$BIqlF1?5c^w@iJwy7T>EXG}e*YyPFR@k$wAX zAv^-n*&acs_n9$Aa+8|>8|Ronms}U9OM8%?Z~HfIRW(Novta=v*CK!od^$n%Mv8`t zYmWV~EGPb?E)02GrARxu!uTsfd6Ul1_m9+|aw*qLrL|5~99oB{e!2zcJVv2DhH8xW zFnbNQd^qj{_(ONnpd`pI5ujPQjof=KURS)}md9PdBc$mmoacf%5@eN)l?t9I+hM_s zrYcWuL{QNgNu<412bNW~P z!ArdlnpKQp#@oUJKJ?e%B73^9DY&SJwb?D`in7(6ksl$Hi4)XLzmW2hobXlbUD%}J zs)W4hgI+xB|2U*%Q1FG|rWzv~>aJ2dpTtF;2QVTU-Z}J#n~gUTW z4)H>_Ho31Ln;nmdAi9@hEJ{m~Kk&vsGGQW+;|<%w3D9D*L#lN>7#!p5ZBN$frdg zsvkZ%6YINeplrZhpEJIoL5{H=P?1fbJ(%)m+%i!{Xd{llWJi5V*Mf9B@KWl5MtQ;3 zi+!@rJM!CUoqu1~UyT);Bwt}P^eG5iKwp0O<%Pd(gXN7Dkvy}Wy$NUUlun^2*uj>H z8F=dO=W@PViULCTUO#p#$5Kv_m=SjPk>nJbN>TvI%2VcwiHt|F?O?~xSP=!HIPQ(( z5G@o`0M|Lm8;8wz5?ejtgxBlCmuub1c{q}ysb zT_s_5!FKFjfGD=jJ!h;tp+J4_9GN zkfCLCpi*Y8G~K3eI5G}S{ki#+?(NNai?INt;{C$yP1Cma-a$3~<5 zYKAM3N)(v(UZX9AOPw6klL<4lsUJ+C2FzczL@RO<)xJH|9h|m+n0YB`PV{_kBGa{C#}(mJaDVW4DkX*D(`W9|_=nAi0(Ha8UIoY5OH_ zyWzv`@&I3Is+9+OovLd!+FS2aTX7%;Y#|`q#~OkkkcdP9wr|bX|7IkWBI*2Me!!>^q4-S$+?NDW z9c+anZ?yS*Iw9~^pL<`wL=0*3NAoKrf(e~`c=-(xpHz)2A4_R3VCvwDxX3=-x~;~7 z1Y3qmWYUllxg>>{MDrZ>xmLOgQu8`z2TJeltRzs+R)pR+S_2FYrj#(V^tjVzZ@Ok? 
zJKsYTgWFc#o8x3@?#xK%LT`QE4SNo{Du?=Nm{vn+!1eS;HO*0`7mUX8oK=0&#JgvY zkKgZ$T+4Cr2MHGF4MG+>Se7{$A!2fJ!6_gMy$4uvw@ec2PP}gNn+qC47uKjXcYy*? zZC;!2A4->Z4R`vW_Z_H|z(nm>Ml>(`yGLhBLIJ*dR=hHh3uBEHrH4}?ENE&DAgWAz z3PyYt>av|Le%8&Eoi~!YZj~1}X+rMJ1+|P*^oS@k;&!z(Uk2zRD1H}Fh!+~ zWbUpb#Q!nuZ^Ncf3JfKMjsZ7Zr~mOJpkQM%cUSUg|57|e?nMn~o$wY97{%$*`dThB zDBGqOpG?r^ySfs%51!O-Z=VLrAivxdRyi*)x}wdx_>CBPFV0J?>cMtjU1K!bRr?(| z99GT9L6E3K!D)17V+mYA8o)O$a4S50A+j_sih>mYzxOn}n{yJCbPj_FlIL{XJ@F1b z!~TR;=zYDJkRd{z!+<3;Jt436cNNRz%G2@(TVy-4c`C48tFRYBAvwmwoLqq?bh z5>Jz^wh2Kq*MT1}b>-$oiRP|O6P z`ncVEn8b*@XrNY!O3u6y^5QPs^uU)h^TPlrOSuKRf7NS?8rZc{i0l(>Uu=AJK(!*FnADo;cRPC44IV98Rf?-_T68dH&_%gG zW!TP2q=O4L!MHV_p;k`?;r$o;$ZViXEm9fadZ6%>YKyH{H@!xl!99IXN)K;L;i^?% z-JHi}EB957I7f_CqEdEz-pEq=t%?t{$hRQl!5EtfTZDp&!Rcs^q?bZtj?a@2lXs3- zZtzz?ai|^sI%cK)7#79al_OIp=b8DgzTXHgMSq00h^j~RfsTX5T z>`lI3CQ*HL@wzx&FRbvA3^GQAj-PdOoPw*3y^^S)o4w=a9kkfuB08{{?0`1AqedS@ zGns`obxfRY5UNU1vk$%8CVez-_Js$~p~fe&jnj6wlx^52HQ(^5&Nl}#G9oRT%l?A|8SVRkL$lb!dW5Fj3M_%^N$Yiv zfWA$T3*hFAkVtbf;~ogl_;s!EE+1(~CT#Dx?qfxiuFkLDP9;FnJxR;*3m&RINXPe( zK}DXvffoyzX;_ZK@WT>GNs}8yhi9r%Eibsumz@Rk`oyOLuWzpLZGhE8^&5Yt;sF1O z$(NLY{pf##t7Z8=xLN=QEBpVOvTLgwImmV4F8YPn zdD6xoJNDk*0P$Md{g?M#%gq?{;&OiBcYQk^S7HCsiRz^_=p|omBhvQeR?g<4hbz0?1XFj68Ki=-Q7GBp0_&FFZ#7l*~;w~*BRCD9E#NxKRWx=Ov z)h;yDs0W^`e;mjRP}O!scHHaezNnWK`C!l@Cdo>GDifoSBsc)B17|=;@|%YF5f^4757-UhIuofNnjR@MTlI;bMC*-T}tFV%@kX?Oxs zxTP2Pr&R!U*5kt;p?XR2J--JW>xJ zJ^%1+apg4QqUJ|`9>?7>wg*usJQmRzqwkk3$i=p%w35nr%=#Pp1E#B|E%^B#nJ$h#h8USs5; z`kXKe(u7{c8A2c9eKjIJRFXe1q*v)h2QLeDJn*)AXGxtyJZmcULkiJh z#aZc7xAOQI?` z*_*f;f6}ihKfhg3G1d5_X0rh4nOLdd7zABiEbN^qsTfo&U2IH0&&sYwR;I=-pFQDE zGPcX-JsoCtw$H|{0|WwP{)*vWQTZ#Te?{Z3=m1$h8xZLBS1kX^KfM0O?C`%9yT3h? 
zzheJajHwu;O+DT1olKl5KbP~5IW6t&L_Q(i07?-qrcd-WknvNQSr}P3=zts)Kp@4Z zpRB#fe{xZAGIVe-HTfid8`?OV!Z9eQh-%P_x!Twm8QR(XjfIMZr8D63`0tp|0@O^M zoIj@qFw--#FtTzov9U7KG10UAPq01*m}&rIEZs~2f7inxVrpb*Xh#LGb})3Ya5gmr zxczfH8$C1qXJ|j+;{RlP|108K#nZtQz#wmArR-u0#~=q_Wd6(Uc5-(4ETjJm_GV`S z{=X|~sEVxpIuPOKi~8k?QN$hHx_~0U_Sjap15>iu72fF*kCioHC9oP~wVTkATeiVy zy*EsxOPp7V6w*N7A6Y4w;WabHz6tJxJobXf@=Is6tJ{(cQjgKOB zfBaE}b2_NJu;GFriDtFRJ_wr$VfZBpp#iQKt?UNkF9r}qT+QF0&vPx%iwuHfWXD>#FF$ddse!b-n?1{`~3d?#^Gcq)GK?aGJWV zDweL=Jn;3jN8R=f@y8A@0ah=z#0rb4j|-2DbFm;6o%X=yrWa#8%}?4q$sT})a64;W zNlchJ{tcX!TARo_6}KPdDWE07+$5+uvIhng~@kFrFzgf>MrTal8H)D zMf$6eou^epV+HU@7a`C(4~?i=D(s|CGgema^+|bQ1)ncXZ%6BNIEs_1ORdfCAFj$2 zEz%@KXX`)8aOGD6hVU*L{DRG)PqphW84wXOFx%7Qol96{Dj7@z*}hc$Hj@r6Dj2O7 zHKe>&uD5u0^OKFr#fmjAq9&B_QG0R5vtMX8$2AGx!86y3Hg_Lvzw{S)$nS4;Kr{;R zq&eW~H?K|=EUh3Ef4k~)<96CKW;>PMC200jP5J&|xI1Qe9$f)ZAOC`Te5^b2ZweXb zzZA0n&K?aVQ!_XQNjno$4}doCGaGf`7?dr&{^lhdgE~N)5x@js{In_A+q-restart runtime, so libraries will be loaded\n", + "\n", + "You do this by going to **`Runtime --> Restart Session`**\n", + "\n", + "Then you can continue to the next step (no need to re-run the notebook)" + ] + }, + { + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": { + "id": "e8b10be1" + }, + "source": [ + "## Step-2: Configuration & Utils" + ] + }, + { + "cell_type": "markdown", + "id": "356c66f7", + "metadata": { + "id": "356c66f7" + }, + "source": [ + "### 2.1 - Basic Config" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e4YMZrBuFycl", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e4YMZrBuFycl", + "outputId": "d7ee9449-4f21-4c9a-fa54-14b7f28d764a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " 
RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "33345487", + "metadata": { + "id": "33345487" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "## Configuration\n", + "class MyConfig:\n", + " pass\n", + "\n", + "MY_CONFIG = MyConfig ()\n", + "\n", + "MY_CONFIG.INPUT_DATA_DIR = 'input'\n", + "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", + "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": { + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" + }, + "source": [ + "### 2.2 - Setup input/output directories" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "outputId": "4d5511fb-1c6f-47df-e5ea-2c1b354d262f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Cleared output directory\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "shutil.os.makedirs(MY_CONFIG.INPUT_DATA_DIR, exist_ok=True)\n", + "\n", + "output_text_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_text_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_fuzzy_dedupe_out')\n", + "output_doc_quality_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_doc_quality_out')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", + "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", + "print (\"✅ Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "14b2f34c", + "metadata": {}, +
"source": [ + "### 2.3 - Handy Utils" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ba47a370", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from humanfriendly import format_size\n", + "import pandas as pd\n", + "import glob\n", + "\n", + "## Reads parquet files in a folder into a pandas dataframe\n", + "def read_parquet_files_as_df (parquet_dir):\n", + " parquet_files = glob.glob(f'{parquet_dir}/*.parquet')\n", + " # read each parquet file into a DataFrame and store in a list\n", + " dfs = [pd.read_parquet (f) for f in parquet_files]\n", + " dfs = [df for df in dfs if not df.empty] # filter out empty dataframes\n", + " # Concatenate all DataFrames into a single DataFrame\n", + " if len(dfs) > 0:\n", + " data_df = pd.concat(dfs, ignore_index=True)\n", + " return data_df\n", + " else:\n", + " return pd.DataFrame() # return empty df\n", + "# ------------" + ] + }, + { + "cell_type": "markdown", + "id": "dc1972c3", + "metadata": {}, + "source": [ + "## Step-3: Inspect the Data\n", + "\n", + "We will use simple PDFs. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/pdf-processing-1/input/)\n", + "\n", + "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/earth.pdf) and exact duplicate [earth-copy.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/earth-copy.pdf)\n", + "- [earth2.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/earth2.pdf) nearly identical to earth.pdf (ONE word difference!)\n", + "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/mars.pdf)\n", + "- [spam.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/spam.pdf) - contains spammy contents\n", + "- [lorem.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/lorem.pdf) - contains 'lorem ipsum' placeholder\n" + ] + }, + { + "cell_type": "markdown", + "id": "7113b16c", + "metadata": {}, + "source": [ + "### 3.1 - Download Data if running on Google Colab" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "23db1064", + "metadata": {}, + "outputs": [], + "source": [ + "if RUNNING_IN_COLAB:\n", + " !mkdir -p '{MY_CONFIG.INPUT_DATA_DIR}'\n", + " !wget -O '{MY_CONFIG.INPUT_DATA_DIR}/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/earth.pdf'\n", + " !wget -O '{MY_CONFIG.INPUT_DATA_DIR}/earth-copy.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/earth-copy.pdf'\n", + " !wget -O '{MY_CONFIG.INPUT_DATA_DIR}/earth2.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/earth2.pdf'\n", + " !wget -O '{MY_CONFIG.INPUT_DATA_DIR}/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/mars.pdf'\n", +
" !wget -O '{MY_CONFIG.INPUT_DATA_DIR}/spam.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/spam.pdf'\n", + " !wget -O '{MY_CONFIG.INPUT_DATA_DIR}/lorem.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/lorem.pdf'" + ] + }, + { + "cell_type": "markdown", + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + "metadata": { + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" + }, + "source": [ + "## Step-4: Extract Data from PDF (pdf2parquet)\n", + "\n", + "In this step we will read PDF files and extract the text data.\n", + "\n", + "[Pdf2Parquet documentation](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/README.md)\n", + "\n", + "We use the [Docling package](https://github.com/DS4SD/docling).\n" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": { + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" + }, + "source": [ + "### 4.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 657, + "referenced_widgets": [ + "97b603697cfa4b4ea4e6735b6768ca35", + "e87e8d3262c54cfaaa8768505edacda3", + "b78aa40816e44f7fbebcb24ca68818b3", + "7053c9606a414e978636a7e241909504", + "da0787b239764847a731083997780a85", + "553f3c16839a49d79591d0fc4862bed6", + "c0eb5bc8f6ee427ca42204b3c56f9a4e", + "9d184ed175f0403fb03c2e13dfd04e0a", + "724778729161445c98b187031ae4f67c", + "1cb3bbf7d724411cbe9831543a4aecc0", + "06f9b33494984e4885d5aad813d1d2bc" + ] + }, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "outputId": "01d207fb-983d-40b2-e5f6-e38e3789110a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_text_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", +
"text": [ + "00:15:20 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "00:15:20 INFO - pipeline id pipeline_id\n", + "00:15:20 INFO - code location None\n", + "00:15:20 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_text_out\n", + "00:15:20 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:15:20 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "00:15:20 INFO - orchestrator pdf2parquet started at 2025-01-21 00:15:20\n", + "00:15:20 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "00:15:20 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 129720.74it/s]\n", + "00:15:24 INFO - Completed 1 files (16.67%) in 0.017 min\n", + "00:15:25 INFO - Completed 2 files (33.33%) in 0.032 min\n", + "00:15:26 INFO - Completed 3 files (50.0%) in 0.044 min\n", + "00:15:27 INFO - Completed 4 files (66.67%) in 0.054 min\n", + "00:15:27 INFO - Completed 5 files (83.33%) in 0.064 min\n", + "00:15:28 INFO - Completed 6 files (100.0%) in 0.075 min\n", + "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", + "00:15:28 INFO - done flushing in 0.0 sec\n", + "00:15:28 INFO - Completed execution in 0.127 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:1 completed successfully\n", + "CPU times: user 21 s, sys: 1.65 s, total: 22.7 s\n", + "Wall time: 10.3 s\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from dpk_pdf2parquet.transform_python import Pdf2Parquet\n", + "from 
dpk_pdf2parquet.transform import pdf2parquet_contents_types\n", + "\n", + "STAGE = 1 \n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{MY_CONFIG.INPUT_DATA_DIR}' --> output='{output_text_dir}'\\n\", flush=True)\n", + "\n", + "result = Pdf2Parquet(input_folder= MY_CONFIG.INPUT_DATA_DIR,\n", + " output_folder= output_text_dir,\n", + " data_files_to_use=['.pdf'],\n", + " pdf2parquet_contents_type=pdf2parquet_contents_types.MARKDOWN, # markdown\n", + " # pdf2parquet_contents_type=pdf2parquet_contents_types.JSON # JSON\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "5ca790e0", + "metadata": { + "id": "5ca790e0" + }, + "source": [ + "### 4.2 - Inspect Generated output\n", + "\n", + "Here we should see one entry per input file processed." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fe59563d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 255 + }, + "id": "fe59563d", + "outputId": "346e0584-bdde-4705-8c2a-f3c1582cd7e7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/01_text_out\n" + ] + }, + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filename
0spam.pdfFree xxx1025a9d562d-ba87-4b2f-954d-fd9e7aece50910026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-21T00:15:28.4191840.660825spam.pdf
1earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10116fdf34dd-3e36-4311-9d85-eaa2c22146dd10729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-21T00:15:26.5122790.691329earth2.pdf
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10110dd165a1-6de2-4df0-ad7a-c6ad21da9c187758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-21T00:15:27.7567190.615258mars.pdf
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101127d9fd2f-d815-4937-bbbf-d4a10cbce4c614711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-21T00:15:24.9153761.019706earth-copy.pdf
4lorem.pdfLorem ipsum Lorem ipsum Lorem ipsum1028a86db6e-0ac6-480c-9a86-d4ac383e25896571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-21T00:15:27.1397790.625818lorem.pdf
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 spam.pdf Free xxx \n", + "1 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "4 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 11 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 2 \n", + "\n", + " document_id document_hash ext \\\n", + "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", + "1 6fdf34dd-3e36-4311-9d85-eaa2c22146dd 10729312978404042321 pdf \n", + "2 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", + "3 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", + "4 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "\n", + " hash size \\\n", + "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
35 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2025-01-21T00:15:28.419184 0.660825 spam.pdf \n", + "1 2025-01-21T00:15:26.512279 0.691329 earth2.pdf \n", + "2 2025-01-21T00:15:27.756719 0.615258 mars.pdf \n", + "3 2025-01-21T00:15:24.915376 1.019706 earth-copy.pdf \n", + "4 2025-01-21T00:15:27.139779 0.625818 lorem.pdf " + ], + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print (\"Displaying contents of : \", output_text_dir)\n", + "output_df = read_parquet_files_as_df(output_text_dir)\n", + "# print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "output_df.head()\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "e5058a21", + "metadata": { + "id": "e5058a21" + }, + "source": [ + "\n", + "### 4.3 - Understand the output\n", + "\n", + "Here are some interesting attributes to note:\n", + "\n", + "- **filename** : original filename\n", + "- **contents** : text\n", + "- **document_id**: unique id (UUID) assigned to this document\n", + "- **document_hash**: hash of the document\n", + "- **hash** : hash of `contents` column\n", + "- **pdf_convert_time** : time to convert this pdf in seconds\n", + "\n", + "**Note: you should notice the hash values are identical for the duplicate documents**\n", + "\n", + "Let's inspect the **contents** column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f870e624", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f870e624", + "outputId": "0b4c054f-3a8a-4db3-f32f-17bd1466b102" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Free xxx\n" + ] + } + ], + "source": [ + "print (output_df.iloc[0, ]['contents'])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e1a10c2d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e1a10c2d", + "outputId": "c1d992c2-faa8-40cd-c375-857970201daa" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Earth\n", + "\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "\n", + "For more details about the Solar system see Chapter 1.\n", + "\n", + "## Earth\n", + "\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "\n", + "Basic facts about Earth:\n", + "\n", + "- · Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "- · Moons: One moon, called Luna or simply \"the Moon\".\n", + "- · Rotation Period: 24 hours (one day)\n" + ] + } + ], + "source": [ + "print (output_df.iloc[1, ]['contents'])\n" + ] + }, + { + "cell_type": "markdown", + "id": "7fc86d5b", + "metadata": {}, + "source": [ + "## Step-5: Create DOC ID for Documents\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. 
To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "\n", + "**This step is a pre-requisite for fuzzy dedup** in the pipeline.\n", + "\n", + "[DocID documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/doc_id)" + ] + }, + { + "cell_type": "markdown", + "id": "f516a253", + "metadata": {}, + "source": [ + "### 5.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "cee20521", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-2: Processing input='output/01_text_out' --> output='output/02_docid_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:15:28 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "00:15:28 INFO - pipeline id pipeline_id\n", + "00:15:28 INFO - code location None\n", + "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/01_text_out output_folder - output/02_docid_out\n", + "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - orchestrator doc_id started at 2025-01-21 00:15:28\n", + "00:15:28 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.049612998962402344}\n", + "00:15:28 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "00:15:28 INFO - 
Completed 2 files (33.33%) in 0.0 min\n", + "00:15:28 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "00:15:28 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "00:15:28 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "00:15:28 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", + "00:15:28 INFO - done flushing in 0.0 sec\n", + "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:2 completed successfully\n", + "CPU times: user 20.5 ms, sys: 2.75 ms, total: 23.2 ms\n", + "Wall time: 18.6 ms\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from dpk_doc_id.transform_python import DocID\n", + "\n", + "STAGE = 2\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_text_dir}' --> output='{output_docid_dir}'\\n\", flush=True)\n", + "\n", + "result = DocID(input_folder= output_text_dir,\n", + " output_folder= output_docid_dir,\n", + " doc_id_doc_column= \"contents\",\n", + " doc_id_hash_column= \"doc_hash\",\n", + " # doc_id_int_column= \"doc_id\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " #doc_id_start_id= 5\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "4bd6f382", + "metadata": {}, + "source": [ + "### 5.2 - Inspect Generated output\n", + "\n", + "You should see two new columns: **doc_hash** and **int_id_column**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f3d4aba9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/02_docid_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_column
0spam.pdfFree xxx1025a9d562d-ba87-4b2f-954d-fd9e7aece50910026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-21T00:15:28.4191840.660825spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5
1earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10116fdf34dd-3e36-4311-9d85-eaa2c22146dd10729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-21T00:15:26.5122790.691329earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10110dd165a1-6de2-4df0-ad7a-c6ad21da9c187758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-21T00:15:27.7567190.615258mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101127d9fd2f-d815-4937-bbbf-d4a10cbce4c614711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-21T00:15:24.9153761.019706earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0
4lorem.pdfLorem ipsum Lorem ipsum Lorem ipsum1028a86db6e-0ac6-480c-9a86-d4ac383e25896571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-21T00:15:27.1397790.625818lorem.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 spam.pdf Free xxx \n", + "1 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "4 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 11 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 2 \n", + "\n", + " document_id document_hash ext \\\n", + "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", + "1 6fdf34dd-3e36-4311-9d85-eaa2c22146dd 10729312978404042321 pdf \n", + "2 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", + "3 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", + "4 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "\n", + " hash size \\\n", + "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-21T00:15:28.419184 0.660825 spam.pdf \n", + "1 2025-01-21T00:15:26.512279 0.691329 earth2.pdf \n", + "2 2025-01-21T00:15:27.756719 0.615258 mars.pdf \n", + "3 2025-01-21T00:15:24.915376 1.019706 earth-copy.pdf \n", + "4 2025-01-21T00:15:27.139779 0.625818 lorem.pdf \n", + "\n", + " doc_hash int_id_column \n", + "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", + "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 \n", + "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print (\"Displaying contents of : \", output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c55f8d3f", + "metadata": {}, + "source": [ + "## Step-6: Eliminate Duplicate Documents\n", + "\n", + "We have 2 exact duplicates: **earth.pdf** and **earth-copy.pdf**\n", + "\n", + "Note that the **doc_hash** values for these two documents are the same.\n", + "\n", + "[Exact dedupe information](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/ededup)" + ] + }, + { + "cell_type": "markdown", + "id": "6f5ef1f7", + "metadata": {}, + "source": [ + "### 6.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "90eddb4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-3: Processing input='output/02_docid_out' --> output='output/03_exact_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:15:28 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "00:15:28 INFO - pipeline id pipeline_id\n", + "00:15:28 INFO - code location None\n", + "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - orchestrator ededup started at 2025-01-21 00:15:28\n", + "00:15:28 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 
0.05621051788330078}\n", + "00:15:28 INFO - Starting from the beginning\n", + "00:15:28 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "00:15:28 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "00:15:28 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "00:15:28 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "00:15:28 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "00:15:28 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", + "00:15:28 INFO - done flushing in 0.0 sec\n", + "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:3 completed successfully\n", + "CPU times: user 25.1 ms, sys: 2.26 ms, total: 27.4 ms\n", + "Wall time: 22 ms\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from dpk_ededup.transform_python import Ededup\n", + "\n", + "STAGE = 3\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_docid_dir}' --> output='{output_exact_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Ededup(input_folder=output_docid_dir,\n", + " output_folder=output_exact_dedupe_dir,\n", + " ededup_doc_column=\"contents\",\n", + " ededup_doc_id_column=\"doc_hash\"\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "f4aacf09", + "metadata": {}, + "source": [ + "### 6.2 - Inspect Generated output\n", + "\n", + "You can see one of **earth.pdf** or **earth-copy.pdf** will be eliminated." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1887b26d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 6\n", + "Output files after exact dedupe : 5\n", + "Duplicate files removed : 1\n", + "Displaying contents of : output/03_exact_dedupe_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0spam.pdfFree xxx1025a9d562d-ba87-4b2f-954d-fd9e7aece50910026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-21T00:15:28.4191840.660825spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
1earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10116fdf34dd-3e36-4311-9d85-eaa2c22146dd10729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-21T00:15:26.5122790.691329earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2[]
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10110dd165a1-6de2-4df0-ad7a-c6ad21da9c187758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-21T00:15:27.7567190.615258mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101127d9fd2f-d815-4937-bbbf-d4a10cbce4c614711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-21T00:15:24.9153761.019706earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0[]
4lorem.pdfLorem ipsum Lorem ipsum Lorem ipsum1028a86db6e-0ac6-480c-9a86-d4ac383e25896571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-21T00:15:27.1397790.625818lorem.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 spam.pdf Free xxx \n", + "1 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "4 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 11 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 2 \n", + "\n", + " document_id document_hash ext \\\n", + "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", + "1 6fdf34dd-3e36-4311-9d85-eaa2c22146dd 10729312978404042321 pdf \n", + "2 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", + "3 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", + "4 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "\n", + " hash size \\\n", + "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-21T00:15:28.419184 0.660825 spam.pdf \n", + "1 2025-01-21T00:15:26.512279 0.691329 earth2.pdf \n", + "2 2025-01-21T00:15:27.756719 0.615258 mars.pdf \n", + "3 2025-01-21T00:15:24.915376 1.019706 earth-copy.pdf \n", + "4 2025-01-21T00:15:27.139779 0.625818 lorem.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 [] \n", + "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 [] " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_exact_dedupe_dir)\n", + "output_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "76ea34e2", + "metadata": {}, + "source": [ + "## Step-7: Fuzzy Dedupe\n", + "\n", + "In previous step, we removed **exact duplicates (identical documents)**.\n", + "\n", + "Fuzzy de-dupe can further filter out documents that are **not exactly identical, but nearly identical**\n", + "\n", + "For example imagine two documents with one extra blank line. For our purposes they are the same.\n", + "\n", + "[Fuzzy dedupe documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/fdedup)\n", + "\n", + "### Tweaking the threshold\n", + "\n", + "**`jaccard_similarity_threshold`** is the parameter used to tweak similarities between documents. It's value is between 0 and 1.0. Values close to 1.0 means more strict checking (fewer documents will qualify). 
A lower threshold means more lenient matching (more documents will qualify)\n", + "\n", + "Adjust this value to find what works for your documents" + ] + }, + { + "cell_type": "markdown", + "id": "79a37713", + "metadata": {}, + "source": [ + "### 7.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "37430b60", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-4: Processing input='output/03_exact_dedupe_out' --> output='output/04_fuzzy_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:15:28 INFO - Starting SignatureCalculation step\n", + "00:15:28 INFO - Got parameters for SignatureCalculation\n", + "00:15:28 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "00:15:28 INFO - data factory scdata_ is using local configuration without input/output path\n", + "00:15:28 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - pipeline id pipeline_id\n", + "00:15:28 INFO - code location None\n", + "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - orchestrator minhash started at 2025-01-21 00:15:28\n", + "00:15:28 INFO - Number of files is 
6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05067157745361328}\n", + "00:15:28 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "00:15:28 WARNING - table is empty, skipping processing\n", + "00:15:28 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "00:15:28 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "00:15:28 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "00:15:28 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "00:15:28 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", + "00:15:28 INFO - Starting flush()\n", + "00:15:28 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", + "00:15:28 INFO - done flushing in 0.02 sec\n", + "00:15:28 INFO - Completed execution in 0.001 min, execution result 0\n", + "00:15:28 INFO - SignatureCalculation completed successfully\n", + "00:15:28 INFO - Starting ClusterAnalysis step\n", + "00:15:28 INFO - Got parameters for ClusterAnalysis\n", + "00:15:28 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "00:15:28 INFO - pipeline id pipeline_id\n", + "00:15:28 INFO - code location None\n", + "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - orchestrator cluster started at 2025-01-21 00:15:28\n", + "00:15:28 INFO - Number of folders is 14\n", + "00:15:28 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "00:15:28 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "00:15:28 INFO - Completed 3 files 
(21.43%) in 0.0 min\n", + "00:15:28 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "00:15:28 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "00:15:28 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "00:15:28 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "00:15:28 INFO - Completed 8 files (57.14%) in 0.0 min\n", + "00:15:28 INFO - Completed 9 files (64.29%) in 0.0 min\n", + "00:15:28 INFO - Completed 10 files (71.43%) in 0.0 min\n", + "00:15:28 INFO - Completed 11 files (78.57%) in 0.0 min\n", + "00:15:28 INFO - Completed 12 files (85.71%) in 0.0 min\n", + "00:15:28 INFO - Completed 13 files (92.86%) in 0.0 min\n", + "00:15:28 INFO - Completed 14 files (100.0%) in 0.0 min\n", + "00:15:28 INFO - Done processing 14 files, waiting for flush() completion.\n", + "00:15:28 INFO - done flushing in 0.0 sec\n", + "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n", + "00:15:28 INFO - ClusterAnalysis completed successfully\n", + "00:15:28 INFO - Starting GetDuplicateList step\n", + "00:15:28 INFO - Got parameters for GetDuplicateList\n", + "00:15:28 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "00:15:28 INFO - pipeline id pipeline_id\n", + "00:15:28 INFO - code location None\n", + "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - orchestrator fdlist started at 2025-01-21 00:15:28\n", + "00:15:28 INFO - Number of folders is 1\n", + "00:15:28 INFO - Get Duplicate List for folder docs_to_remove\n", + "00:15:28 INFO - 1 documents marked as 
duplicates\n", + "00:15:28 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "00:15:28 INFO - Done processing 1 files, waiting for flush() completion.\n", + "00:15:28 INFO - done flushing in 0.0 sec\n", + "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n", + "00:15:28 INFO - GetDuplicateList completed successfully\n", + "00:15:28 INFO - Starting DataCleaning step\n", + "00:15:28 INFO - Got parameters for DataCleaning\n", + "00:15:28 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "00:15:28 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "00:15:28 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - pipeline id pipeline_id\n", + "00:15:28 INFO - code location None\n", + "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - orchestrator fdclean started at 2025-01-21 00:15:28\n", + "00:15:28 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05067157745361328}\n", + "00:15:28 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "00:15:28 WARNING - table is empty, skipping processing\n", + "00:15:28 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "00:15:28 INFO - Completed 3 files (50.0%) in 
0.0 min\n", + "00:15:28 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "00:15:28 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "00:15:28 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", + "00:15:28 INFO - done flushing in 0.0 sec\n", + "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n", + "00:15:28 INFO - DataCleaning completed successfully\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 301 ms, sys: 71.6 ms, total: 373 ms\n", + "Wall time: 265 ms\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from dpk_fdedup.transform_python import Fdedup\n", + "\n", + "STAGE = 4\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_exact_dedupe_dir}' --> output='{output_fuzzy_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Fdedup(input_folder=output_exact_dedupe_dir,\n", + " output_folder=output_fuzzy_dedupe_dir,\n", + " contents_column= \"contents\",\n", + " # document_id_column= \"doc_id\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " jaccard_similarity_threshold = 0.8, # between 0 - 1. higher means more strict checking\n", + " operation_mode=\"filter_duplicates\",\n", + " # operation_mode=\"annotate\",\n", + " ).transform()\n", + "# if result == 0:\n", + "# print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "# else:\n", + "# raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" + ] + }, + { + "cell_type": "markdown", + "id": "b2c83592", + "metadata": {}, + "source": [ + "### 7.2 - Inspect Output\n", + "\n", + "FuzzyDedupe writes the documents that remain after filtering to the **output/04_fuzzy_dedupe_out/cleaned** folder.\n", + "\n", + "You will notice that only one of the Earth documents made it (in the run shown below, **earth-copy.pdf** is kept and **earth2.pdf** is dropped)! So fuzzy dedupe did filter out the almost identical doc." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "573faba2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 5\n", + "Output files after exact dedupe : 4\n", + "Near duplicate files removed : 1\n", + "Displaying contents of : output/04_fuzzy_dedupe_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0spam.pdfFree xxx1025a9d562d-ba87-4b2f-954d-fd9e7aece50910026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-21T00:15:28.4191840.660825spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
1mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10110dd165a1-6de2-4df0-ad7a-c6ad21da9c187758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-21T00:15:27.7567190.615258mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
2earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101127d9fd2f-d815-4937-bbbf-d4a10cbce4c614711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-21T00:15:24.9153761.019706earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0[]
3lorem.pdfLorem ipsum Lorem ipsum Lorem ipsum1028a86db6e-0ac6-480c-9a86-d4ac383e25896571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-21T00:15:27.1397790.625818lorem.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 spam.pdf Free xxx \n", + "1 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "2 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 11 \n", + "2 1 0 11 \n", + "3 1 0 2 \n", + "\n", + " document_id document_hash ext \\\n", + "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", + "1 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", + "2 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", + "3 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "\n", + " hash size \\\n", + "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "1 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "2 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "3 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-21T00:15:28.419184 0.660825 spam.pdf \n", + "1 2025-01-21T00:15:27.756719 0.615258 mars.pdf \n", + "2 2025-01-21T00:15:24.915376 1.019706 earth-copy.pdf \n", + "3 2025-01-21T00:15:27.139779 0.625818 lorem.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "1 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "2 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 [] \n", + "3 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 [] " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "output_df = read_parquet_files_as_df(os.path.join(output_fuzzy_dedupe_dir, \"cleaned\"))\n", + "\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Near duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_fuzzy_dedupe_dir)\n", + "output_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "3e0598a0", + "metadata": {}, + "source": [ + "## Step-8: Document Quality\n", + "\n", + "This handy plugin will score documents across many metrics.\n", + "\n", + "Here we will look for 'bad words' metric.\n", + "\n", + "[Document quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" + ] + }, + { + "cell_type": "markdown", + "id": "1949c2c4", + "metadata": {}, + "source": [ + "### 8.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b485f598", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-5: Processing input='output/04_fuzzy_dedupe_out/cleaned' --> output='output/05_doc_quality_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:15:28 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-2-pdf-processing/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "00:15:28 INFO - data factory docq_ is using local 
configuration without input/output path\n", + "00:15:28 INFO - data factory docq_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - pipeline id pipeline_id\n", + "00:15:28 INFO - code location None\n", + "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:15:28 INFO - orchestrator docq started at 2025-01-21 00:15:28\n", + "00:15:28 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.04009246826171875}\n", + "00:15:28 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-2-pdf-processing/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "00:15:28 INFO - Completed 1 files (20.0%) in 0.0 min\n", + "00:15:28 WARNING - table is empty, skipping processing\n", + "00:15:28 INFO - Completed 2 files (40.0%) in 0.0 min\n", + "00:15:28 INFO - Completed 3 files (60.0%) in 0.0 min\n", + "00:15:28 INFO - Completed 4 files (80.0%) in 0.0 min\n", + "00:15:28 INFO - Completed 5 files (100.0%) in 0.0 min\n", + "00:15:28 INFO - Done processing 5 files, waiting for flush() completion.\n", + "00:15:28 INFO - done flushing in 0.0 sec\n", + "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:5 completed successfully\n", + "CPU times: user 38.8 ms, sys: 2.53 ms, total: 41.3 ms\n", + "Wall time: 35.3 ms\n" + ] + } + ], + "source": [ + 
"%%time \n", + "\n", + "from dpk_doc_quality.transform_python import DocQuality\n", + "\n", + "STAGE = 5\n", + "output_fuzzy_dedupe_cleaned_dir = os.path.join(output_fuzzy_dedupe_dir, \"cleaned\")\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_fuzzy_dedupe_cleaned_dir}' --> output='{output_doc_quality_dir}'\\n\", flush=True)\n", + "\n", + "result = DocQuality(input_folder=output_fuzzy_dedupe_cleaned_dir,\n", + " output_folder= output_doc_quality_dir,\n", + " docq_text_lang = \"en\",\n", + " docq_doc_content_column =\"contents\",\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" + ] + }, + { + "cell_type": "markdown", + "id": "eccefd3e", + "metadata": {}, + "source": [ + "### 8.2 - Inspect the Output\n", + "\n", + "We will see several new columns starting with the name **docq_**.\n", + "\n", + "We will look at a metric **docq_contain_bad_word** and filter out any documents that have bad words.\n", + "\n", + "For more information see : [Doc Quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1f3225f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/05_doc_quality_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
0spam.pdfFree xxx1025a9d562d-ba87-4b2f-954d-fd9e7aece50910026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...8...3.5000000.00000010.0000000.0True0.0000000.01.000000False
1mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10110dd165a1-6de2-4df0-ad7a-c6ad21da9c187758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.0000000.0False0.1764710.00.880000True
2earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101127d9fd2f-d815-4937-bbbf-d4a10cbce4c614711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.0000000.0False0.1764710.00.880734True
3lorem.pdfLorem ipsum Lorem ipsum Lorem ipsum1028a86db6e-0ac6-480c-9a86-d4ac383e25896571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...35...5.0000000.00000010.0857140.0False0.0000000.01.000000False
\n", + "

4 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 spam.pdf Free xxx \n", + "1 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "2 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 11 \n", + "2 1 0 11 \n", + "3 1 0 2 \n", + "\n", + " document_id document_hash ext \\\n", + "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", + "1 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", + "2 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", + "3 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "\n", + " hash size ... \\\n", + "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 ... \n", + "1 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "2 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", + "3 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... 
\n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "0 3.500000 0.000000 1 \n", + "1 4.688000 0.032000 8 \n", + "2 4.541284 0.027523 9 \n", + "3 5.000000 0.000000 1 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "0 0.000000 0.0 True \n", + "1 0.000000 0.0 False \n", + "2 0.000000 0.0 False \n", + "3 0.085714 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "0 0.000000 0.0 \n", + "1 0.176471 0.0 \n", + "2 0.176471 0.0 \n", + "3 0.000000 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "0 1.000000 False \n", + "1 0.880000 True \n", + "2 0.880734 True \n", + "3 1.000000 False \n", + "\n", + "[4 rows x 27 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "print (\"Displaying contents of : \", output_doc_quality_dir)\n", + "output_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "02fa3bd2", + "metadata": {}, + "source": [ + "### 8.3 - Filtering 'quality' documents\n", + "\n", + "So from the output above we see **spam.pdf** is flagged for containing bad words (see column **docq_contain_bad_word**).\n", + "\n", + "Also **lorem.pdf** is flagged for place holder content **lorem ipsum**\n", + "\n", + "We are going to filter them both out" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5dac1c70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
1mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10110dd165a1-6de2-4df0-ad7a-c6ad21da9c187758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.00.0False0.1764710.00.880000True
2earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...101127d9fd2f-d815-4937-bbbf-d4a10cbce4c614711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.00.0False0.1764710.00.880734True
\n", + "

2 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "1 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "2 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "1 1 0 11 \n", + "2 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "1 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", + "2 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", + "\n", + " hash size ... \\\n", + "1 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "2 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "1 4.688000 0.032000 8 \n", + "2 4.541284 0.027523 9 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "1 0.0 0.0 False \n", + "2 0.0 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "1 0.176471 0.0 \n", + "2 0.176471 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "1 0.880000 True \n", + "2 0.880734 True \n", + "\n", + "[2 rows x 27 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_docs_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "\n", + "# remove documents with badwords\n", + "clean_docs = all_docs_df[all_docs_df['docq_contain_bad_word'] == False]\n", + "\n", + "# also filter out 'lorem ipsum' text\n", + "clean_docs = clean_docs[clean_docs['docq_lorem_ipsum_ratio'] == 0]\n", + "\n", + "clean_docs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": { + "id": "f5e12630-be6b-4188-a925-77117155617b" + }, + "source": [ + "## Step-9: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "outputId": "31f09b58-7b2d-48bb-9dac-bc0ba9625c01" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Saved output to 'output/output_final'\n" + ] + } + ], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", + "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER_FINAL, exist_ok=True)\n", + "\n", + "clean_docs.to_parquet(os.path.join(MY_CONFIG.OUTPUT_FOLDER_FINAL, \"final.parquet\"))\n", + "print (f\"✅ Saved output to '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "dpk-2-pdf-processing", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "06f9b33494984e4885d5aad813d1d2bc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1cb3bbf7d724411cbe9831543a4aecc0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "553f3c16839a49d79591d0fc4862bed6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7053c9606a414e978636a7e241909504": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1cb3bbf7d724411cbe9831543a4aecc0", + "placeholder": "​", + "style": "IPY_MODEL_06f9b33494984e4885d5aad813d1d2bc", + "value": " 10/10 [00:00<00:00, 349.38it/s]" + } + }, + "724778729161445c98b187031ae4f67c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "97b603697cfa4b4ea4e6735b6768ca35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e87e8d3262c54cfaaa8768505edacda3", + "IPY_MODEL_b78aa40816e44f7fbebcb24ca68818b3", + "IPY_MODEL_7053c9606a414e978636a7e241909504" + ], + "layout": "IPY_MODEL_da0787b239764847a731083997780a85" + } + }, + 
"9d184ed175f0403fb03c2e13dfd04e0a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b78aa40816e44f7fbebcb24ca68818b3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9d184ed175f0403fb03c2e13dfd04e0a", + "max": 10, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_724778729161445c98b187031ae4f67c", + "value": 10 + } + }, + 
"c0eb5bc8f6ee427ca42204b3c56f9a4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "da0787b239764847a731083997780a85": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e87e8d3262c54cfaaa8768505edacda3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_553f3c16839a49d79591d0fc4862bed6", + "placeholder": "​", + "style": "IPY_MODEL_c0eb5bc8f6ee427ca42204b3c56f9a4e", + "value": "Fetching 10 files: 100%" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/intro/dpk_intro_1_ray.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb similarity index 100% rename from examples/notebooks/intro/dpk_intro_1_ray.ipynb rename to examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb From 1af9b09310722b9ce4305105f1a574a069b67033 Mon Sep 17 00:00:00 2001 From: Sujee Maniyam Date: Tue, 21 Jan 2025 23:55:47 -0800 Subject: [PATCH 2/6] updated example - updated the diagram - simplified notebook Signed-off-by: Sujee Maniyam --- examples/notebooks/pdf-processing-1/README.md | 2 +- .../data-prep-kit-3-workflow.excalidraw | 3247 ++++++++++------- .../images/data-prep-kit-3-workflow.png | Bin 101303 -> 127061 bytes .../input/{lorem.md => lorem-ipsum.md} | 0 .../input/{lorem.pdf => lorem-ipsum.pdf} | Bin .../pdf_processing_1_python.ipynb | 1227 ++++--- 6 files changed, 2557 insertions(+), 1919 deletions(-) rename examples/notebooks/pdf-processing-1/input/{lorem.md => lorem-ipsum.md} (100%) rename examples/notebooks/pdf-processing-1/input/{lorem.pdf => lorem-ipsum.pdf} (100%) diff --git a/examples/notebooks/pdf-processing-1/README.md b/examples/notebooks/pdf-processing-1/README.md index a611183e9..84d8e15c6 100644 --- a/examples/notebooks/pdf-processing-1/README.md +++ b/examples/notebooks/pdf-processing-1/README.md @@ -43,5 +43,5 @@ pandoc earth.md -o earth.pdf pandoc earth2.md -o earth2.pdf pandoc mars.md -o mars.pdf pandoc spam.md -o spam.pdf -pandoc lorem.md -o lorem.pdf +pandoc lorem-ipsum.md -o lorem-ipsum.pdf ``` \ No newline at end of file diff 
--git a/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw index c0525c556..03b19ce3c 100644 --- a/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw +++ b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.excalidraw @@ -5,44 +5,8 @@ "elements": [ { "type": "image", - "version": 128, - "versionNonce": 146671843, - "index": "b45", - "isDeleted": false, - "id": "nQdFTOsh8Rjwn3poFcnOO", - "fillStyle": "solid", - "strokeWidth": 1, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 258.1818181818182, - "y": 213.63636363636363, - "strokeColor": "transparent", - "backgroundColor": "transparent", - "width": 64, - "height": 64, - "seed": 222183398, - "groupIds": [ - "4aSnKsxGoqeqA7eYu4s2e" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726186954844, - "link": null, - "locked": false, - "status": "saved", - "fileId": "83ba3062a1490699e3ccc129acb25b1f4ec5534d", - "scale": [ - 1, - 1 - ] - }, - { - "type": "image", - "version": 240, - "versionNonce": 2054222979, + "version": 457, + "versionNonce": 173110248, "index": "b46", "isDeleted": false, "id": "hlPJZs7lUbLYhuRbSmYHs", @@ -52,29 +16,23 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 260.90909090909093, - "y": 285.4545454545455, + "x": 194.90909090909093, + "y": 202.4545454545455, "strokeColor": "transparent", "backgroundColor": "transparent", "width": 64, "height": 64, "seed": 961787386, - "groupIds": [ - "4aSnKsxGoqeqA7eYu4s2e" - ], + "groupIds": [], "frameId": null, "roundness": null, "boundElements": [ { "id": "FVhCmDYbWjGck9rgcESwp", "type": "arrow" - }, - { - "id": "JMprrs8mNVD4CrqUlVm7i", - "type": "arrow" } ], - "updated": 1726186954844, + "updated": 1737528573258, "link": null, "locked": false, "status": "saved", @@ -82,12 +40,13 @@ "scale": [ 1, 1 - ] + ], + "crop": null }, { "type": 
"arrow", - "version": 2550, - "versionNonce": 1240871476, + "version": 2976, + "versionNonce": 1926996376, "index": "b47", "isDeleted": false, "id": "FVhCmDYbWjGck9rgcESwp", @@ -97,12 +56,12 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 823.5583207607388, - "y": 273.73602641681657, + "x": 583.0728843528818, + "y": 265.0654681139756, "strokeColor": "#2f9e44", "backgroundColor": "transparent", - "width": 154.2895204048931, - "height": 2.3372664247598323, + "width": 221.74126076768994, + "height": 0.598117686721821, "seed": 1954615226, "groupIds": [], "frameId": null, @@ -110,16 +69,21 @@ "type": 2 }, "boundElements": [], - "updated": 1726708776348, + "updated": 1737528696232, "link": null, "locked": false, "startBinding": { - "elementId": "Wxv71stEiYRpNjyhzzXgO", - "focus": 1.202109076005182, - "gap": 9.103775306193256, + "elementId": "YFlD_rDw6IwCctPG9BjYf", + "focus": 0.841290319837998, + "gap": 12.052870784360664, + "fixedPoint": null + }, + "endBinding": { + "elementId": "DolT9H5aqzEugA7sUfNlx", + "focus": -0.14468495613909563, + "gap": 10.4071488270705, "fixedPoint": null }, - "endBinding": null, "lastCommittedPoint": null, "startArrowhead": null, "endArrowhead": "arrow", @@ -129,61 +93,15 @@ 0 ], [ - 154.2895204048931, - 2.3372664247598323 + 221.74126076768994, + -0.598117686721821 ] ] }, - { - "type": "text", - "version": 324, - "versionNonce": 1281521869, - "index": "b4M", - "isDeleted": false, - "id": "zSJvmm-7DrsR5-qRb96Kl", - "fillStyle": "solid", - "strokeWidth": 1, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 595.4118679291607, - "y": 242.27481706603328, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "width": 141.51840079198635, - "height": 59.453152259008114, - "seed": 409665722, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [ - { - "id": "JMprrs8mNVD4CrqUlVm7i", - "type": "arrow" - }, - { - "id": "0wYqjwjKHCGbx7CfmDR__", - "type": "arrow" - } - ], - "updated": 
1726186894805, - "link": null, - "locked": false, - "fontSize": 23.781260903603247, - "fontFamily": 1, - "text": "2. split into\nchunks", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "2. split into\nchunks", - "autoResize": true, - "lineHeight": 1.25 - }, { "type": "arrow", - "version": 848, - "versionNonce": 138401069, + "version": 1191, + "versionNonce": 1753926120, "index": "b4N", "isDeleted": false, "id": "JMprrs8mNVD4CrqUlVm7i", @@ -193,12 +111,12 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 329.1268602850381, - "y": 278.24885892455757, + "x": 303.3582097473162, + "y": 267.24885892455757, "strokeColor": "#2f9e44", "backgroundColor": "#b2f2bb", - "width": 185.2530890548909, - "height": 2.823455039174007, + "width": 198.02173959261273, + "height": 2.6228850442226985, "seed": 1319994682, "groupIds": [], "frameId": null, @@ -206,19 +124,19 @@ "type": 2 }, "boundElements": [], - "updated": 1726186962183, + "updated": 1737528662023, "link": null, "locked": false, "startBinding": { - "elementId": "hlPJZs7lUbLYhuRbSmYHs", - "focus": -1.189794049219074, - "gap": 7.205686529987929, + "elementId": "QSiEFZIoz081ipwdmU8sg", + "focus": 0.36390758833591985, + "gap": 4.736856944692818, "fixedPoint": null }, "endBinding": { "elementId": "YFlD_rDw6IwCctPG9BjYf", - "focus": 1.1403432588201572, - "gap": 6.460959750980123, + "focus": -0.7972060339621995, + "gap": 9.46095975098018, "fixedPoint": null }, "lastCommittedPoint": null, @@ -230,15 +148,15 @@ 0 ], [ - 185.2530890548909, - -2.823455039174007 + 198.02173959261273, + -2.6228850442226985 ] ] }, { "type": "text", - "version": 757, - "versionNonce": 361576332, + "version": 865, + "versionNonce": 1985915368, "index": "b4O", "isDeleted": false, "id": "G0k27V_VE7lyh7YGr_fts", @@ -248,11 +166,11 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 1128.9917648038, - "y": 212.9780740734803, + "x": 934.9917648037998, + "y": 247.9780740734803, "strokeColor": "#1e1e1e", 
"backgroundColor": "#b2f2bb", - "width": 110.85037231445312, + "width": 100.90922546386719, "height": 58.225670034857664, "seed": 970452474, "groupIds": [], @@ -264,23 +182,23 @@ "type": "arrow" } ], - "updated": 1726708803406, + "updated": 1737528832732, "link": null, "locked": false, "fontSize": 23.290268013943066, "fontFamily": 1, - "text": "4. dedupe\n(exact)", + "text": "3. exact\ndedupe", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "4. dedupe\n(exact)", + "originalText": "3. exact\ndedupe", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 598, - "versionNonce": 1689279715, + "version": 614, + "versionNonce": 181505944, "index": "b4g", "isDeleted": false, "id": "XUbC5cWQCm-GEFrdqZW7g", @@ -290,8 +208,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 333.94038113680745, - "y": 243.15978750685963, + "x": 319.94038113680745, + "y": 233.15978750685963, "strokeColor": "#1e1e1e", "backgroundColor": "#ffc9c9", "width": 173.54608154296875, @@ -306,7 +224,7 @@ "type": "arrow" } ], - "updated": 1726187078639, + "updated": 1737528653755, "link": null, "locked": false, "fontSize": 22.766190549743982, @@ -319,183 +237,10 @@ "autoResize": true, "lineHeight": 1.25 }, - { - "type": "image", - "version": 145, - "versionNonce": 1461008621, - "index": "b4h", - "isDeleted": false, - "id": "XH-Rt0Q5-K2g4tM9reh76", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 520.8409090909091, - "y": 209.88636363636368, - "strokeColor": "transparent", - "backgroundColor": "transparent", - "width": 64, - "height": 64, - "seed": 1159948140, - "groupIds": [ - "KKvJ56bTHwzAbN8YXYU0-" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726186894805, - "link": null, - "locked": false, - "status": "saved", - "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", - "scale": [ - 1, - 1 - ] - }, - { - "type": "image", - 
"version": 193, - "versionNonce": 1127846733, - "index": "b4i", - "isDeleted": false, - "id": "YFlD_rDw6IwCctPG9BjYf", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 520.8409090909091, - "y": 279.8863636363637, - "strokeColor": "transparent", - "backgroundColor": "transparent", - "width": 64, - "height": 64, - "seed": 1369151980, - "groupIds": [ - "KKvJ56bTHwzAbN8YXYU0-" - ], - "frameId": null, - "roundness": null, - "boundElements": [ - { - "id": "0wYqjwjKHCGbx7CfmDR__", - "type": "arrow" - }, - { - "id": "JMprrs8mNVD4CrqUlVm7i", - "type": "arrow" - } - ], - "updated": 1726186894805, - "link": null, - "locked": false, - "status": "saved", - "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", - "scale": [ - 1, - 1 - ] - }, - { - "type": "arrow", - "version": 753, - "versionNonce": 1832909987, - "index": "b4j", - "isDeleted": false, - "id": "0wYqjwjKHCGbx7CfmDR__", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 0, - "opacity": 100, - "angle": 0, - "x": 587.6995151292258, - "y": 276.08728311464677, - "strokeColor": "#2f9e44", - "backgroundColor": "#b2f2bb", - "width": 160.10395921482052, - "height": 0.6238794650969908, - "seed": 1397245780, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726186894829, - "link": null, - "locked": false, - "startBinding": { - "elementId": "YFlD_rDw6IwCctPG9BjYf", - "focus": -1.1101505124640194, - "gap": 3.799080521716917, - "fixedPoint": null - }, - "endBinding": { - "elementId": "zSJvmm-7DrsR5-qRb96Kl", - "focus": -0.1259939432648205, - "gap": 10.873205622899263, - "fixedPoint": null - }, - "lastCommittedPoint": null, - "startArrowhead": null, - "endArrowhead": "arrow", - "points": [ - [ - 0, - 0 - ], - [ - 160.10395921482052, - -0.6238794650969908 - ] - ] - }, - { - "type": "text", - "version": 19, - "versionNonce": 1725165603, - "index": "b4t", - 
"isDeleted": false, - "id": "56KAsZE3Fub50OzL9XJ35", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 344.7055268721148, - "y": 290.01136363636374, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 137.6798553466797, - "height": 25, - "seed": 961622755, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726187031887, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "(pdf2parquet)", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "(pdf2parquet)", - "autoResize": true, - "lineHeight": 1.25 - }, { "type": "text", - "version": 89, - "versionNonce": 1217800429, + "version": 132, + "versionNonce": 1504935576, "index": "b4u", "isDeleted": false, "id": "GEwyTqhl4LrSwcaOeKRT5", @@ -505,71 +250,34 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 514.7055268721148, - "y": 356.01136363636374, + "x": 518.7055268721148, + "y": 383.01136363636374, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 74.97993469238281, + "width": 92.63992309570312, "height": 50, "seed": 31755757, "groupIds": [], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726187172155, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "parquet\nfiles", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "parquet\nfiles", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "text", - "version": 273, - "versionNonce": 821721012, - "index": "b5F", - "isDeleted": false, - "id": "ZGkHBN9UBrJLYPIlm-KTj", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1355.555487199263, - "y": 305.51136363636374, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 118.5198974609375, - 
"height": 50, - "seed": 1591407981, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726708923087, + "updated": 1737528618509, "link": null, "locked": false, "fontSize": 20, "fontFamily": 5, - "text": "duplicate 'B'\nis removed", + "text": "markdown\ntext", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "duplicate 'B'\nis removed", + "originalText": "markdown\ntext", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 747, - "versionNonce": 104645940, + "version": 804, + "versionNonce": 859000296, "index": "b5G", "isDeleted": false, "id": "DolT9H5aqzEugA7sUfNlx", @@ -579,34 +287,39 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 827.643003983931, - "y": 226.3985286189349, + "x": 596.643003983931, + "y": 231.3985286189349, "strokeColor": "#1e1e1e", "backgroundColor": "#b2f2bb", - "width": 166.41502380371094, - "height": 29.112835017428832, + "width": 197.7639923095703, + "height": 58.225670034857664, "seed": 466678605, "groupIds": [], "frameId": null, "roundness": null, - "boundElements": [], - "updated": 1726708795102, + "boundElements": [ + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" + } + ], + "updated": 1737528686607, "link": null, "locked": false, "fontSize": 23.290268013943066, "fontFamily": 1, - "text": "3. document id", + "text": "2. document id\n(compute hashes)", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "3. document id", + "originalText": "2. 
document id\n(compute hashes)", "autoResize": true, "lineHeight": 1.25 }, { "type": "arrow", - "version": 1071, - "versionNonce": 474965812, + "version": 1254, + "versionNonce": 980324072, "index": "b5U", "isDeleted": false, "id": "cXhTkxU13WdQeAv3Z_1mR", @@ -616,12 +329,12 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 1318.993474938044, - "y": 401.3233033689122, + "x": 1145.993474938044, + "y": 268.31133050044286, "strokeColor": "#2f9e44", "backgroundColor": "#b2f2bb", - "width": 0.8539592148204065, - "height": 113.62612053490295, + "width": 167.8539592148204, + "height": 1.6380934033722951, "seed": 605419139, "groupIds": [], "frameId": null, @@ -629,11 +342,21 @@ "type": 2 }, "boundElements": [], - "updated": 1726709016812, + "updated": 1737528943852, "link": null, "locked": false, - "startBinding": null, - "endBinding": null, + "startBinding": { + "elementId": "Qaz1byDgzm-0ZrVLBmU4v", + "focus": -0.37744699407794313, + "gap": 8.76620221077144, + "fixedPoint": null + }, + "endBinding": { + "elementId": "LbPBuhQ2btuEnjbeSDvuK", + "focus": -2.1413835587747667, + "gap": 14.33294663108768, + "fixedPoint": null + }, "lastCommittedPoint": null, "startArrowhead": null, "endArrowhead": "arrow", @@ -643,15 +366,15 @@ 0 ], [ - 0.8539592148204065, - 113.62612053490295 + 167.8539592148204, + 1.6380934033722951 ] ] }, { "type": "text", - "version": 976, - "versionNonce": 988237964, + "version": 1037, + "versionNonce": 1974786200, "index": "b5V", "isDeleted": false, "id": "Ba_pxAykcwH_ZsTbAtduc", @@ -661,34 +384,34 @@ "roughness": 0, "opacity": 100, "angle": 0, - "x": 1218.815207047896, - "y": 429.9549461276493, + "x": 1160.815207047896, + "y": 234.9549461276493, "strokeColor": "#1e1e1e", "backgroundColor": "#b2f2bb", - "width": 184.07017517089844, - "height": 29.112835017428832, + "width": 98.09219360351562, + "height": 58.225670034857664, "seed": 1665190893, "groupIds": [], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726709020882, + 
"updated": 1737528881336, "link": null, "locked": false, "fontSize": 23.290268013943066, "fontFamily": 1, - "text": "5. fuzzy dedupe", + "text": "4. fuzzy\ndedupe", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "5. fuzzy dedupe", + "originalText": "4. fuzzy\ndedupe", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 580, - "versionNonce": 693951668, + "version": 677, + "versionNonce": 1394703256, "index": "b5h", "isDeleted": false, "id": "XFHbtP2KmiHNNjZhz8ajW", @@ -698,8 +421,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1299.1022727272725, - "y": 517.40625, + "x": 1334.1022727272725, + "y": 178.40625, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, @@ -718,14 +441,14 @@ "id": "OdGsWefGyr6uqMl0wC6mH" } ], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false }, { "type": "text", - "version": 323, - "versionNonce": 1216816692, + "version": 420, + "versionNonce": 2107525272, "index": "b5i", "isDeleted": false, "id": "OdGsWefGyr6uqMl0wC6mH", @@ -735,8 +458,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1315.9786418568, - "y": 522.40625, + "x": 1350.9786418568, + "y": 183.40625, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 13.519989013671875, @@ -748,7 +471,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false, "fontSize": 20, @@ -763,8 +486,8 @@ }, { "type": "rectangle", - "version": 573, - "versionNonce": 1856782260, + "version": 677, + "versionNonce": 1612348312, "index": "b5j", "isDeleted": false, "id": "NzWqph0M7tEkeTDKLPGZR", @@ -774,8 +497,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1301.1931818181815, - "y": 564.5880681818182, + "x": 1336.1931818181815, + "y": 225.58806818181824, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, @@ -792,16 +515,20 
@@ { "type": "text", "id": "K1QK2dyVWiWfd32P8ovQK" + }, + { + "id": "-CNAjEmW6cbufb2V3aXbb", + "type": "arrow" } ], - "updated": 1726708989657, + "updated": 1737530583902, "link": null, "locked": false }, { "type": "text", - "version": 264, - "versionNonce": 334637364, + "version": 364, + "versionNonce": 150023400, "index": "b5k", "isDeleted": false, "id": "K1QK2dyVWiWfd32P8ovQK", @@ -811,11 +538,11 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1317.219552473588, - "y": 569.5880681818182, + "x": 1351.329545454545, + "y": 230.58806818181824, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 15.219985961914062, + "width": 17, "height": 25, "seed": 1350557773, "groupIds": [ @@ -824,7 +551,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708989657, + "updated": 1737530583904, "link": null, "locked": false, "fontSize": 20, @@ -839,8 +566,8 @@ }, { "type": "rectangle", - "version": 680, - "versionNonce": 1002365620, + "version": 777, + "versionNonce": 1889202072, "index": "b5l", "isDeleted": false, "id": "Lf5-FqrnO7iDVhOKUtEnT", @@ -850,8 +577,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1306.9204545454545, - "y": 619.3267045454547, + "x": 1341.9204545454545, + "y": 280.32670454545473, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, @@ -870,14 +597,14 @@ "id": "cTJ-8HZCMcNbXqDHggxAH" } ], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false }, { "type": "text", - "version": 375, - "versionNonce": 213412916, + "version": 472, + "versionNonce": 331955352, "index": "b5m", "isDeleted": false, "id": "cTJ-8HZCMcNbXqDHggxAH", @@ -887,8 +614,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1324.2668248956852, - "y": 624.3267045454547, + "x": 1359.2668248956852, + "y": 285.32670454545473, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 12.579986572265625, @@ -900,7 +627,7 @@ "frameId": null, "roundness": null, 
"boundElements": [], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false, "fontSize": 20, @@ -915,8 +642,8 @@ }, { "type": "text", - "version": 141, - "versionNonce": 1757726132, + "version": 238, + "versionNonce": 900065688, "index": "b5n", "isDeleted": false, "id": "LK6nmMo09HhGvAeViRfcK", @@ -926,8 +653,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1274.397727272727, - "y": 523.3664772727274, + "x": 1309.397727272727, + "y": 184.36647727272737, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 12, @@ -939,7 +666,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708989657, + "updated": 1737528940801, "link": null, "locked": false, "fontSize": 20, @@ -954,8 +681,8 @@ }, { "type": "text", - "version": 196, - "versionNonce": 761917108, + "version": 294, + "versionNonce": 1508025832, "index": "b5o", "isDeleted": false, "id": "LbPBuhQ2btuEnjbeSDvuK", @@ -965,8 +692,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1278.397727272727, - "y": 569.6164772727275, + "x": 1313.397727272727, + "y": 230.61647727272748, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 11, @@ -977,8 +704,13 @@ ], "frameId": null, "roundness": null, - "boundElements": [], - "updated": 1726708993287, + "boundElements": [ + { + "id": "cXhTkxU13WdQeAv3Z_1mR", + "type": "arrow" + } + ], + "updated": 1737528943380, "link": null, "locked": false, "fontSize": 20, @@ -993,8 +725,8 @@ }, { "type": "text", - "version": 385, - "versionNonce": 800257204, + "version": 484, + "versionNonce": 1538941848, "index": "b5p", "isDeleted": false, "id": "tEnh5H4Dm1tA62FJY7ZnT", @@ -1004,8 +736,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1279.647727272727, - "y": 629.6164772727275, + "x": 1314.647727272727, + "y": 290.6164772727275, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 11, @@ -1017,7 +749,7 @@ "frameId": null, "roundness": null, "boundElements": [], - 
"updated": 1726709003336, + "updated": 1737528940801, "link": null, "locked": false, "fontSize": 20, @@ -1032,8 +764,8 @@ }, { "type": "text", - "version": 307, - "versionNonce": 51819060, + "version": 406, + "versionNonce": 313505768, "index": "b5q", "isDeleted": false, "id": "TExMhRi4612k0BcybcpHE", @@ -1043,8 +775,8 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": 1251.2855058149858, - "y": 678.5113636363637, + "x": 1286.2855058149858, + "y": 339.51136363636374, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 143.59986877441406, @@ -1056,7 +788,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708989657, + "updated": 1737530582726, "link": null, "locked": false, "fontSize": 20, @@ -1069,243 +801,28 @@ "autoResize": true, "lineHeight": 1.25 }, - { - "type": "arrow", - "version": 1039, - "versionNonce": 199529869, - "index": "b5r", - "isDeleted": false, - "id": "KvvwHoDnDT0vBh2bOfiTz", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 0, - "opacity": 100, - "angle": 0, - "x": 1245.243474938044, - "y": 579.5733033689121, - "strokeColor": "#2f9e44", - "backgroundColor": "#b2f2bb", - "width": 192.8960407851796, - "height": 1.126120534903066, - "seed": 1004556899, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726188444758, - "link": null, - "locked": false, - "startBinding": null, - "endBinding": null, - "lastCommittedPoint": null, - "startArrowhead": null, - "endArrowhead": "arrow", - "points": [ - [ - 0, - 0 - ], - [ - -192.8960407851796, - 1.126120534903066 - ] - ] - }, - { - "type": "text", - "version": 989, - "versionNonce": 923042467, - "index": "b5s", - "isDeleted": false, - "id": "cPSHqIr9Peb5h5TNxl3Bb", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 0, - "opacity": 100, - "angle": 0, - "x": 1100.5103669600053, - "y": 536.2049461276495, - "strokeColor": "#1e1e1e", - 
"backgroundColor": "#b2f2bb", - "width": 138.99639892578125, - "height": 29.112835017428832, - "seed": 865272429, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726188447614, - "link": null, - "locked": false, - "fontSize": 23.290268013943066, - "fontFamily": 1, - "text": "6. vectorize", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "6. vectorize", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "diamond", - "version": 103, - "versionNonce": 679668419, - "index": "b5vV", - "isDeleted": false, - "id": "tPvUjMUp7lW3F8V3H2MGV", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 960.0454545454546, - "y": 515.5113636363637, - "strokeColor": "#1e1e1e", - "backgroundColor": "#d0bfff", - "width": 63.75, - "height": 45, - "seed": 782762477, - "groupIds": [ - "CuM_sg3LC9KTYRVST18pX" - ], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726188516836, - "link": null, - "locked": false - }, - { - "type": "diamond", - "version": 117, - "versionNonce": 224511779, - "index": "b5w", - "isDeleted": false, - "id": "uOIVUAj_hGKNiZ3NnQm2n", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 961.9204545454546, - "y": 564.5113636363637, - "strokeColor": "#1e1e1e", - "backgroundColor": "#d0bfff", - "width": 63.75, - "height": 45, - "seed": 1245990083, - "groupIds": [ - "CuM_sg3LC9KTYRVST18pX" - ], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726188516836, - "link": null, - "locked": false - }, - { - "type": "diamond", - "version": 122, - "versionNonce": 1205596301, - "index": "b5x", - "isDeleted": false, - "id": "ylh6O0GmjhRAHndHyuEo2", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 
0, - "x": 966.9204545454546, - "y": 615.7613636363637, - "strokeColor": "#1e1e1e", - "backgroundColor": "#d0bfff", - "width": 63.75, - "height": 45, - "seed": 499397773, - "groupIds": [ - "CuM_sg3LC9KTYRVST18pX" - ], - "frameId": null, - "roundness": { - "type": 2 - }, - "boundElements": [], - "updated": 1726188516836, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 260, - "versionNonce": 1136192621, - "index": "b5y", - "isDeleted": false, - "id": "ekXIjXxtZ6f2w_A-9CVUV", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 938.2855058149859, - "y": 670.7613636363637, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 107.5399169921875, - "height": 25, - "seed": 1616985635, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726188507123, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "embeddings", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "embeddings", - "autoResize": true, - "lineHeight": 1.25 - }, { "type": "rectangle", - "version": 381, - "versionNonce": 1618061620, - "index": "b5z", + "version": 589, + "versionNonce": 1049638120, + "index": "b698", "isDeleted": false, - "id": "Uv-8TiLeECJuuNx1yJjtv", + "id": "JNHVvikjirDDllCKotbJC", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 768.5454545454545, - "y": 280.72727272727275, + "x": 844.9545454545454, + "y": 249.68750000000006, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 637818278, + "seed": 848769955, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": { @@ -1313,45 +830,45 @@ }, "boundElements": [ { - "id": "0wYqjwjKHCGbx7CfmDR__", - "type": "arrow" + "type": "text", + "id": 
"8Msc7tXcZdg2UUH2NmUn-" }, { - "type": "text", - "id": "B8Nj-HzRDl-FA-5UJ2hiw" + "id": "M_WCuesgPRdSQ_zqaUtz0", + "type": "arrow" } ], - "updated": 1726708776347, + "updated": 1737528714494, "link": null, "locked": false }, { "type": "text", - "version": 140, - "versionNonce": 1472181260, - "index": "b60", + "version": 348, + "versionNonce": 1968921752, + "index": "b69G", "isDeleted": false, - "id": "B8Nj-HzRDl-FA-5UJ2hiw", + "id": "8Msc7tXcZdg2UUH2NmUn-", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 783.2418233698064, - "y": 285.72727272727275, + "x": 859.6509142788972, + "y": 254.68750000000006, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 17.879989624023438, "height": 25, - "seed": 1971906541, + "seed": 1297532739, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, @@ -1359,33 +876,33 @@ "text": "A'", "textAlign": "center", "verticalAlign": "middle", - "containerId": "Uv-8TiLeECJuuNx1yJjtv", + "containerId": "JNHVvikjirDDllCKotbJC", "originalText": "A'", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 391, - "versionNonce": 1280205492, - "index": "b61", + "version": 626, + "versionNonce": 1609828760, + "index": "b69O", "isDeleted": false, - "id": "l7XMM15Xwzq5xmDF0QvyN", + "id": "fkbHGW5tJ-Ay0sh8h-9hJ", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 764.090909090909, - "y": 186.09090909090912, + "x": 841.4999999999999, + "y": 156.05113636363643, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1556091898, + "seed": 2116216547, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": { @@ -1394,40 
+911,40 @@ "boundElements": [ { "type": "text", - "id": "SZp9x_uNQ-65LQPMQ768C" + "id": "BNiP4zX7PtFTn_e_5vXX3" } ], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false }, { "type": "text", - "version": 132, - "versionNonce": 809849484, - "index": "b62", + "version": 369, + "versionNonce": 753866392, + "index": "b69V", "isDeleted": false, - "id": "SZp9x_uNQ-65LQPMQ768C", + "id": "BNiP4zX7PtFTn_e_5vXX3", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 780.9672782204367, - "y": 191.09090909090912, + "x": 858.3763691295275, + "y": 161.05113636363643, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 13.519989013671875, "height": 25, - "seed": 912377443, + "seed": 1804210819, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, @@ -1435,83 +952,75 @@ "text": "A", "textAlign": "center", "verticalAlign": "middle", - "containerId": "l7XMM15Xwzq5xmDF0QvyN", + "containerId": "fkbHGW5tJ-Ay0sh8h-9hJ", "originalText": "A", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 413, - "versionNonce": 1599597620, - "index": "b63", + "version": 619, + "versionNonce": 553681816, + "index": "b69d", "isDeleted": false, - "id": "Wxv71stEiYRpNjyhzzXgO", + "id": "QYKbNgibs7-HxaNNr8tfG", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 767.1818181818182, - "y": 234.27272727272725, + "x": 843.5909090909089, + "y": 203.23295454545456, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 775085434, + "seed": 1716177443, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": { "type": 3 }, "boundElements": [ - { - 
"id": "0wYqjwjKHCGbx7CfmDR__", - "type": "arrow" - }, - { - "id": "FVhCmDYbWjGck9rgcESwp", - "type": "arrow" - }, { "type": "text", - "id": "zyU1230-bmsHaQTSoi7Ov" + "id": "C-rwFmAbwI_qgVqpkXy7m" } ], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false }, { "type": "text", - "version": 102, - "versionNonce": 1402151180, - "index": "b64", + "version": 310, + "versionNonce": 1247563928, + "index": "b69l", "isDeleted": false, - "id": "zyU1230-bmsHaQTSoi7Ov", + "id": "C-rwFmAbwI_qgVqpkXy7m", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 783.2081888372248, - "y": 239.27272727272725, + "x": 859.6172797463154, + "y": 208.23295454545456, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", "width": 15.219985961914062, "height": 25, - "seed": 1842733667, + "seed": 592678339, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, @@ -1519,33 +1028,33 @@ "text": "B", "textAlign": "center", "verticalAlign": "middle", - "containerId": "Wxv71stEiYRpNjyhzzXgO", + "containerId": "QYKbNgibs7-HxaNNr8tfG", "originalText": "B", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 397, - "versionNonce": 997475764, - "index": "b65", + "version": 714, + "versionNonce": 1354136984, + "index": "b69t", "isDeleted": false, - "id": "IkaeA2i4mlTdmulYEI_na", + "id": "m2Wj9fp76PKCAhrulCmTa", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 771.3636363636363, - "y": 325.3636363636364, + "x": 846.3181818181819, + "y": 339.97159090909105, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1839286010, + "seed": 901963107, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + 
"ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": { @@ -1554,265 +1063,1493 @@ "boundElements": [ { "type": "text", - "id": "IgKDOIQhfqb_x9gQh30eh" + "id": "MNgTOO1UYazXucNSjXZ_z" } ], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false }, { "type": "text", - "version": 89, - "versionNonce": 421732236, - "index": "b66", + "version": 409, + "versionNonce": 1162021528, + "index": "b6A", "isDeleted": false, - "id": "IgKDOIQhfqb_x9gQh30eh", + "id": "MNgTOO1UYazXucNSjXZ_z", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 787.3900070190429, - "y": 330.3636363636364, + "x": 863.6645521684126, + "y": 344.97159090909105, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 15.219985961914062, + "width": 12.579986572265625, "height": 25, - "seed": 1893385699, + "seed": 1223112963, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, "fontFamily": 5, - "text": "B", + "text": "C", "textAlign": "center", "verticalAlign": "middle", - "containerId": "IkaeA2i4mlTdmulYEI_na", - "originalText": "B", + "containerId": "m2Wj9fp76PKCAhrulCmTa", + "originalText": "C", "autoResize": true, "lineHeight": 1.25 }, { - "type": "rectangle", - "version": 440, - "versionNonce": 1439264564, - "index": "b67", + "type": "text", + "version": 188, + "versionNonce": 1924528024, + "index": "b6AG", "isDeleted": false, - "id": "qGfihx9_lQSyc1F8oQTu0", + "id": "J1KVE_C00rdGo7FWIwu1X", "fillStyle": "solid", - "strokeWidth": 1, + "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 772.909090909091, - "y": 369.01136363636374, + "x": 817.7954545454544, + "y": 162.01136363636374, "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - 
"height": 35, - "seed": 1381062179, + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 1442121325, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "0DIl-np94wHje4sIubFJp" - } - ], - "updated": 1726708776347, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, "link": null, - "locked": false + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1", + "autoResize": true, + "lineHeight": 1.25 }, { "type": "text", - "version": 133, - "versionNonce": 1496272396, - "index": "b68", + "version": 242, + "versionNonce": 759383192, + "index": "b6AV", "isDeleted": false, - "id": "0DIl-np94wHje4sIubFJp", + "id": "TIEDsM4QhNNDJARAJnvDz", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 790.2554612593218, - "y": 374.01136363636374, - "strokeColor": "#1e1e1e", + "x": 820.7954545454544, + "y": 208.26136363636374, + "strokeColor": "#e03131", "backgroundColor": "transparent", - "width": 12.579986572265625, + "width": 11, "height": 25, - "seed": 1722325443, + "seed": 846611715, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "ssihZCwGeFNCQehvjAg06" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708776347, + "updated": 1737528708101, "link": null, "locked": false, "fontSize": 20, - "fontFamily": 5, - "text": "C", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "qGfihx9_lQSyc1F8oQTu0", - "originalText": "C", - "autoResize": true, + "fontFamily": 8, + "text": "2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "2", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 290, + "versionNonce": 580841880, + "index": "b6Al", + "isDeleted": 
false, + "id": "tGvqUuD_kCzfMYn-UX8o-", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 823.2954545454544, + "y": 257.01136363636374, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 758667053, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "3", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "3", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 421, + "versionNonce": 704446104, + "index": "b6B", + "isDeleted": false, + "id": "IQM8OVr381UGBDKQtda8U", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 823.0454545454544, + "y": 345.26136363636374, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 618433805, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "5", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 672, + "versionNonce": 336685976, + "index": "b6BV", + "isDeleted": false, + "id": "fJGd6Pf-SaTmbDMUGHhUW", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 847.3972327492455, + "y": 296.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1491526540, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + 
"frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "Ax-8fSsrXvrkMhlGAgJgO" + } + ], + "updated": 1737528708101, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 363, + "versionNonce": 2064660632, + "index": "b6C", + "isDeleted": false, + "id": "Ax-8fSsrXvrkMhlGAgJgO", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 863.423603404652, + "y": 301.2812500000001, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 1943704076, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "fJGd6Pf-SaTmbDMUGHhUW", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 320, + "versionNonce": 313353624, + "index": "b6CV", + "isDeleted": false, + "id": "07qZABiLS71UbigBsFpnK", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 821.033596385609, + "y": 301.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 1965424820, + "groupIds": [ + "ssihZCwGeFNCQehvjAg06" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528708101, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "4", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "4", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "arrow", + "version": 2745, + "versionNonce": 1420536808, + "index": "b6D", + "isDeleted": false, + "id": "M_WCuesgPRdSQ_zqaUtz0", + "fillStyle": 
"solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 905.532130562785, + "y": 274.97561555378826, + "strokeColor": "#2f9e44", + "backgroundColor": "transparent", + "width": 162.00146582282412, + "height": 0.6286347709357187, + "seed": 1489010356, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "startBinding": { + "elementId": "JNHVvikjirDDllCKotbJC", + "focus": 0.4403861575576877, + "gap": 13.304857835512394, + "fixedPoint": null + }, + "endBinding": { + "elementId": "NxUqy-MsYDga_9XDrU9l7", + "focus": -0.04300532190875777, + "gap": 1, + "fixedPoint": null + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + 162.00146582282412, + -0.6286347709357187 + ] + ] + }, + { + "type": "text", + "version": 311, + "versionNonce": 212346088, + "index": "b6D8", + "isDeleted": false, + "id": "ZGkHBN9UBrJLYPIlm-KTj", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1062.555487199263, + "y": 410.51136363636374, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 118.5198974609375, + "height": 50, + "seed": 1591407981, + "groupIds": [ + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897882, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "duplicate 'B'\nis removed", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "duplicate 'B'\nis removed", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 285, + "versionNonce": 1763919848, + "index": "b6DG", + "isDeleted": false, + "id": "wkavhEPwz2TNGwf8xFeLA", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + 
"roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1065.0335963856091, + "y": 172.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 809955212, + "groupIds": [ + "uHtPh4-PiLJtgc-p_Cdgo", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897882, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "1", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "1", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 653, + "versionNonce": 1883376360, + "index": "b6DO", + "isDeleted": false, + "id": "Qaz1byDgzm-0ZrVLBmU4v", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1089.9545454545455, + "y": 257.1875000000001, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 144156909, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "D2HbgzHXdGyxGppwaWbBy" + }, + { + "id": "cXhTkxU13WdQeAv3Z_1mR", + "type": "arrow" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 410, + "versionNonce": 1998221544, + "index": "b6DV", + "isDeleted": false, + "id": "D2HbgzHXdGyxGppwaWbBy", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1104.6509142788973, + "y": 262.1875000000001, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 17.879989624023438, + "height": 25, + "seed": 2062418765, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", 
+ "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A'", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Qaz1byDgzm-0ZrVLBmU4v", + "originalText": "A'", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 683, + "versionNonce": 1735136232, + "index": "b6Dd", + "isDeleted": false, + "id": "-LxVJeZLqj0MgI5FEg_pm", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1082.5, + "y": 163.55113636363643, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1514803629, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "trFDjiJr6cfNlCSEKqNjE" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 425, + "versionNonce": 1133598440, + "index": "b6Dl", + "isDeleted": false, + "id": "trFDjiJr6cfNlCSEKqNjE", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1099.3763691295276, + "y": 168.55113636363643, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 13.519989013671875, + "height": 25, + "seed": 1674925069, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A", + "textAlign": "center", + "verticalAlign": "middle", + 
"containerId": "-LxVJeZLqj0MgI5FEg_pm", + "originalText": "A", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 680, + "versionNonce": 269892072, + "index": "b6E", + "isDeleted": false, + "id": "Kxu9owye4gMpRvh7kJ1Nl", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1088.590909090909, + "y": 210.73295454545456, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 1938377325, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "UP92rSYiIXnnBFhov6WNx" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 370, + "versionNonce": 1611054312, + "index": "b6EG", + "isDeleted": false, + "id": "UP92rSYiIXnnBFhov6WNx", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1104.6172797463157, + "y": 215.73295454545456, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 15.219985961914062, + "height": 25, + "seed": 707753165, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Kxu9owye4gMpRvh7kJ1Nl", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 707, + "versionNonce": 82763752, + "index": "b6EV", + "isDeleted": false, + "id": "KMOsOR4pOx-ute2ztnw1k", + "fillStyle": "solid", + 
"strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1094.318181818182, + "y": 345.4715909090911, + "strokeColor": "#e03131", + "backgroundColor": "#ffc9c9", + "width": 47.27272727272725, + "height": 35, + "seed": 635317229, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "SsRO-f6mzQzf5jQOudz6C" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 401, + "versionNonce": 1054515944, + "index": "b6El", + "isDeleted": false, + "id": "SsRO-f6mzQzf5jQOudz6C", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1111.6645521684127, + "y": 350.4715909090911, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 12.579986572265625, + "height": 25, + "seed": 1382819405, + "groupIds": [ + "bDrNCHlMlNcEbIn9yZXly", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "C", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "KMOsOR4pOx-ute2ztnw1k", + "originalText": "C", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 319, + "versionNonce": 1817576936, + "index": "b6F", + "isDeleted": false, + "id": "US1PK13ekocRlMvOrHSJL", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1066.0335963856091, + "y": 215.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 1525760780, + "groupIds": [ + 
"bQ__H1TgpJXskAm32UBLZ", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "2", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "2", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 357, + "versionNonce": 980224232, + "index": "b6FV", + "isDeleted": false, + "id": "NxUqy-MsYDga_9XDrU9l7", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1068.5335963856091, + "y": 261.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 12, + "height": 25, + "seed": 1116920372, + "groupIds": [ + "4mN8vM1PMjtKHfzWdqXES", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [ + { + "id": "M_WCuesgPRdSQ_zqaUtz0", + "type": "arrow" + } + ], + "updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "3", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "3", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 353, + "versionNonce": 354283240, + "index": "b6G", + "isDeleted": false, + "id": "lSEPKkiY8if2M9pDun8DS", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1071.5335963856091, + "y": 354.2812500000001, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "width": 11, + "height": 25, + "seed": 932194828, + "groupIds": [ + "Z8bVLPerSCYHViV4Ld1Ed", + "XEHMHITFJTjudNYgVFCPu", + "vyfIXhnJpss6uiuzFKps6", + "UUMeFgK8RcVkGIGDsRBi8" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + 
"updated": 1737528897883, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 8, + "text": "5", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "text", + "version": 145, + "versionNonce": 56362904, + "index": "b6Q", + "isDeleted": false, + "id": "9Bwc8DwyPnrOxUQpApvfU", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 257.30863987315786, + "y": 383.5312500000001, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "width": 103.71990966796875, + "height": 50, + "seed": 1385699816, + "groupIds": [], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528426042, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "PDF \ndocuments", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "PDF \ndocuments", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 494, + "versionNonce": 1068503272, + "index": "b6R", + "isDeleted": false, + "id": "QSiEFZIoz081ipwdmU8sg", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 251.34862552989614, + "y": 242.95738636363643, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 1529123224, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "_Z-rRn1k6dRs-cBIHwwQY" + }, + { + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + } + ], + "updated": 1737528651437, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 265, + "versionNonce": 1790196968, + "index": "b6S", + "isDeleted": false, + "id": "_Z-rRn1k6dRs-cBIHwwQY", + "fillStyle": "solid", + 
"strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 265.2249946594238, + "y": 247.95738636363643, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 19.519989013671875, + "height": 25, + "seed": 13541016, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528539700, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "A'", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "QSiEFZIoz081ipwdmU8sg", + "originalText": "A'", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 505, + "versionNonce": 48835560, + "index": "b6T", + "isDeleted": false, + "id": "3xE7duRO9Qq4Sc-G2OvNv", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 246.89408007535064, + "y": 148.3210227272728, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 1605307288, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "Vb3hONt1wd7JHFzI3HmrQ" + } + ], + "updated": 1737528540117, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 262, + "versionNonce": 1551754904, + "index": "b6U", + "isDeleted": false, + "id": "Vb3hONt1wd7JHFzI3HmrQ", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 263.03044371171427, + "y": 153.3210227272728, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 15, + "height": 25, + "seed": 1106892952, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528540117, + "link": null, + "locked": false, + "fontSize": 20, + 
"fontFamily": 5, + "text": "A", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "3xE7duRO9Qq4Sc-G2OvNv", + "originalText": "A", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 525, + "versionNonce": 225964696, + "index": "b6V", + "isDeleted": false, + "id": "ooV7vvmtMmdPRnQmMHBmf", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 249.98498916625965, + "y": 196.50284090909093, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 191038872, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "_rMbVkq-GLuJSkRWHvjkn" + } + ], + "updated": 1737528539700, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 227, + "versionNonce": 472392424, + "index": "b6W", + "isDeleted": false, + "id": "_rMbVkq-GLuJSkRWHvjkn", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 265.1213528026233, + "y": 201.50284090909093, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 17, + "height": 25, + "seed": 152998552, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528539700, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "ooV7vvmtMmdPRnQmMHBmf", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 510, + "versionNonce": 768826600, + "index": "b6X", + "isDeleted": false, + "id": "JUjlPmSPagKyAA6ikwVcf", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 
254.16680734807767, + "y": 287.59375000000006, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 1105231768, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "Tov62fM0_erGxbIhudlqt" + }, + { + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + } + ], + "updated": 1737528566266, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 214, + "versionNonce": 1140033000, + "index": "b6Y", + "isDeleted": false, + "id": "Tov62fM0_erGxbIhudlqt", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 269.3031709844413, + "y": 292.59375000000006, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 17, + "height": 25, + "seed": 1172098200, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528539700, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "B", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "JUjlPmSPagKyAA6ikwVcf", + "originalText": "B", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "rectangle", + "version": 578, + "versionNonce": 1264463000, + "index": "b6Z", + "isDeleted": false, + "id": "4cU98zwq8Qi78OlWyES2s", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 255.71226189353263, + "y": 331.2414772727274, + "strokeColor": "#e03131", + "backgroundColor": "#b2f2bb", + "width": 47.27272727272725, + "height": 35, + "seed": 2127002008, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": { + "type": 3 + }, + "boundElements": [ + { + "type": "text", + "id": "hDWulD4JcLixt2n_PIyWF" + } + ], + "updated": 1737528539700, + "link": null, + "locked": 
false + }, + { + "type": "text", + "version": 284, + "versionNonce": 1113229544, + "index": "b6a", + "isDeleted": false, + "id": "hDWulD4JcLixt2n_PIyWF", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 272.34862552989625, + "y": 336.2414772727274, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 14, + "height": 25, + "seed": 2144634520, + "groupIds": [ + "syqTr4z_spUvkhxRP2GMv" + ], + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1737528539700, + "link": null, + "locked": false, + "fontSize": 20, + "fontFamily": 5, + "text": "C", + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "4cU98zwq8Qi78OlWyES2s", + "originalText": "C", + "autoResize": true, "lineHeight": 1.25 }, { - "type": "text", - "version": 70, - "versionNonce": 247294132, - "index": "b69", + "type": "image", + "version": 295, + "versionNonce": 1682243816, + "index": "b6d", "isDeleted": false, - "id": "lkM4ke2d8E4KSisX5yE08", + "id": "XH-Rt0Q5-K2g4tM9reh76", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 762.5454545454546, - "y": 429.51136363636374, - "strokeColor": "#1e1e1e", - "backgroundColor": "#d0bfff", - "width": 64.55995178222656, - "height": 25, - "seed": 1905848653, + "x": 510.8409090909091, + "y": 143.88636363636368, + "strokeColor": "transparent", + "backgroundColor": "transparent", + "width": 60.17910447761194, + "height": 60.17910447761194, + "seed": 1159948140, "groupIds": [ - "wECUsJGvuBUaz0aXhNgT4" + "KGVjVuaPc35r3zwmLpo6p" ], "frameId": null, "roundness": null, - "boundElements": [], - "updated": 1726708776347, + "boundElements": [ + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" + } + ], + "updated": 1737528662022, "link": null, "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "chunks", - "textAlign": "left", - "verticalAlign": "top", - "containerId": 
null, - "originalText": "chunks", - "autoResize": true, - "lineHeight": 1.25 + "status": "saved", + "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "scale": [ + 1, + 1 + ], + "crop": null }, { - "type": "rectangle", - "version": 527, - "versionNonce": 1269467404, - "index": "b698", + "type": "image", + "version": 344, + "versionNonce": 276052968, + "index": "b6e", "isDeleted": false, - "id": "JNHVvikjirDDllCKotbJC", + "id": "YFlD_rDw6IwCctPG9BjYf", "fillStyle": "solid", - "strokeWidth": 1, + "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1025.9545454545455, - "y": 275.68750000000006, - "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - "height": 35, - "seed": 848769955, + "x": 510.8409090909091, + "y": 209.70725915875175, + "strokeColor": "transparent", + "backgroundColor": "transparent", + "width": 60.17910447761194, + "height": 60.17910447761194, + "seed": 1369151980, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "KGVjVuaPc35r3zwmLpo6p" ], "frameId": null, - "roundness": { - "type": 3 - }, + "roundness": null, "boundElements": [ { - "type": "text", - "id": "8Msc7tXcZdg2UUH2NmUn-" + "id": "JMprrs8mNVD4CrqUlVm7i", + "type": "arrow" + }, + { + "id": "FVhCmDYbWjGck9rgcESwp", + "type": "arrow" } ], - "updated": 1726708934863, + "updated": 1737528663639, "link": null, - "locked": false + "locked": false, + "status": "saved", + "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "scale": [ + 1, + 1 + ], + "crop": null }, { - "type": "text", - "version": 287, - "versionNonce": 1779271564, - "index": "b69G", + "type": "image", + "version": 375, + "versionNonce": 1533627624, + "index": "b6f", "isDeleted": false, - "id": "8Msc7tXcZdg2UUH2NmUn-", + "id": "7R-AwuwB2mlKHQ4TA3v7g", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1040.6509142788973, - "y": 280.68750000000006, - "strokeColor": "#1e1e1e", + "x": 
507.5390491822035, + "y": 280.3521455223882, + "strokeColor": "transparent", "backgroundColor": "transparent", - "width": 17.879989624023438, - "height": 25, - "seed": 1297532739, + "width": 60.17910447761194, + "height": 60.17910447761194, + "seed": 1189477272, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "KGVjVuaPc35r3zwmLpo6p" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737528662023, "link": null, "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "A'", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "JNHVvikjirDDllCKotbJC", - "originalText": "A'", - "autoResize": true, - "lineHeight": 1.25 + "status": "saved", + "fileId": "fffa228d79e3bc7053142e0031890d5aaf369b8a", + "scale": [ + 1, + 1 + ], + "crop": null }, { "type": "rectangle", - "version": 565, - "versionNonce": 1888269836, - "index": "b69O", + "version": 804, + "versionNonce": 602477288, + "index": "b6g", "isDeleted": false, - "id": "fkbHGW5tJ-Ay0sh8h-9hJ", + "id": "e4ecV_y0ryxDQzzpC-xuB", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1022.5, - "y": 182.05113636363643, + "x": 1480.6454339460893, + "y": 499.97869318181824, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 2116216547, + "seed": 1087979672, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": { @@ -1821,40 +2558,40 @@ "boundElements": [ { "type": "text", - "id": "BNiP4zX7PtFTn_e_5vXX3" + "id": "uQnFGHOdIKBjcans1vzUh" } ], - "updated": 1726708934863, + "updated": 1737530585213, "link": null, "locked": false }, { "type": "text", - "version": 308, - "versionNonce": 1814172812, - "index": "b69V", + "version": 548, + "versionNonce": 957607832, + "index": "b6h", "isDeleted": false, - "id": "BNiP4zX7PtFTn_e_5vXX3", + "id": "uQnFGHOdIKBjcans1vzUh", "fillStyle": "solid", "strokeWidth": 2, 
"strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1039.3763691295276, - "y": 187.05113636363643, + "x": 1496.7817975824528, + "y": 504.97869318181824, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 13.519989013671875, + "width": 15, "height": 25, - "seed": 1804210819, + "seed": 1242918296, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585213, "link": null, "locked": false, "fontSize": 20, @@ -1862,33 +2599,33 @@ "text": "A", "textAlign": "center", "verticalAlign": "middle", - "containerId": "fkbHGW5tJ-Ay0sh8h-9hJ", + "containerId": "e4ecV_y0ryxDQzzpC-xuB", "originalText": "A", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 558, - "versionNonce": 981967628, - "index": "b69d", + "version": 797, + "versionNonce": 102135272, + "index": "b6i", "isDeleted": false, - "id": "QYKbNgibs7-HxaNNr8tfG", + "id": "_NOEhFqnCLHtq6yXXa5Ft", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1024.590909090909, - "y": 229.23295454545456, + "x": 1482.7363430369983, + "y": 547.1605113636365, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1716177443, + "seed": 356776600, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": { @@ -1897,40 +2634,40 @@ "boundElements": [ { "type": "text", - "id": "C-rwFmAbwI_qgVqpkXy7m" + "id": "J3LCjL2uxV-fjOQWF1Nyl" } ], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false }, { "type": "text", - "version": 249, - "versionNonce": 1916232076, - "index": "b69l", + "version": 489, + "versionNonce": 1696742552, + "index": "b6j", "isDeleted": false, - "id": "C-rwFmAbwI_qgVqpkXy7m", + "id": "J3LCjL2uxV-fjOQWF1Nyl", "fillStyle": "solid", "strokeWidth": 2, 
"strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1040.6172797463155, - "y": 234.23295454545456, + "x": 1497.8727066733618, + "y": 552.1605113636365, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 15.219985961914062, + "width": 17, "height": 25, - "seed": 592678339, + "seed": 1964566424, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -1938,33 +2675,33 @@ "text": "B", "textAlign": "center", "verticalAlign": "middle", - "containerId": "QYKbNgibs7-HxaNNr8tfG", + "containerId": "_NOEhFqnCLHtq6yXXa5Ft", "originalText": "B", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 653, - "versionNonce": 1248546828, - "index": "b69t", + "version": 910, + "versionNonce": 580876520, + "index": "b6k", "isDeleted": false, - "id": "m2Wj9fp76PKCAhrulCmTa", - "fillStyle": "solid", + "id": "JQQ2WM4JRpHcVDQ6tWh9E", + "fillStyle": "cross-hatch", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1027.318181818182, - "y": 365.97159090909105, + "x": 1488.4636157642713, + "y": 601.899147727273, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 901963107, + "seed": 1170748568, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": { @@ -1973,40 +2710,40 @@ "boundElements": [ { "type": "text", - "id": "MNgTOO1UYazXucNSjXZ_z" + "id": "-t96Vcbd_pHmWnfG-tPFY" } ], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false }, { "type": "text", - "version": 348, - "versionNonce": 52260492, - "index": "b6A", + "version": 602, + "versionNonce": 1943988632, + "index": "b6l", "isDeleted": false, - "id": "MNgTOO1UYazXucNSjXZ_z", + "id": "-t96Vcbd_pHmWnfG-tPFY", "fillStyle": 
"solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1044.6645521684127, - "y": 370.97159090909105, + "x": 1505.0999794006348, + "y": 606.899147727273, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 12.579986572265625, + "width": 14, "height": 25, - "seed": 1223112963, + "seed": 1023795608, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2014,38 +2751,38 @@ "text": "C", "textAlign": "center", "verticalAlign": "middle", - "containerId": "m2Wj9fp76PKCAhrulCmTa", + "containerId": "JQQ2WM4JRpHcVDQ6tWh9E", "originalText": "C", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 127, - "versionNonce": 1292352780, - "index": "b6AG", + "version": 365, + "versionNonce": 1829772264, + "index": "b6m", "isDeleted": false, - "id": "J1KVE_C00rdGo7FWIwu1X", + "id": "VdLIGckmm2zBfC3i4wvrn", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 998.7954545454545, - "y": 188.01136363636374, + "x": 1455.9408884915438, + "y": 505.9389204545456, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 12, "height": 25, - "seed": 1442121325, + "seed": 973467288, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2060,31 +2797,36 @@ }, { "type": "text", - "version": 181, - "versionNonce": 832846732, - "index": "b6AV", + "version": 424, + "versionNonce": 1974063512, + "index": "b6n", "isDeleted": false, - "id": "TIEDsM4QhNNDJARAJnvDz", + "id": "KCk9Ks3UrLoOid_qWtcKt", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, 
"angle": 0, - "x": 1001.7954545454545, - "y": 234.26136363636374, + "x": 1459.9408884915438, + "y": 552.1889204545457, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 11, "height": 25, - "seed": 846611715, + "seed": 360471448, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, - "boundElements": [], - "updated": 1726708934863, + "boundElements": [ + { + "id": "uJzNGI-VzOHyMa0kMCtyo", + "type": "arrow" + } + ], + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2099,382 +2841,289 @@ }, { "type": "text", - "version": 229, - "versionNonce": 2066541068, - "index": "b6Al", - "isDeleted": false, - "id": "tGvqUuD_kCzfMYn-UX8o-", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1004.2954545454545, - "y": 283.01136363636374, - "strokeColor": "#e03131", - "backgroundColor": "transparent", - "width": 12, - "height": 25, - "seed": 758667053, - "groupIds": [ - "ssihZCwGeFNCQehvjAg06" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726708934863, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 8, - "text": "3", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "3", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "text", - "version": 360, - "versionNonce": 479971468, - "index": "b6B", + "version": 611, + "versionNonce": 125066984, + "index": "b6o", "isDeleted": false, - "id": "IQM8OVr381UGBDKQtda8U", + "id": "uc2hgh9lXoidExmskulnJ", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1004.0454545454545, - "y": 371.26136363636374, + "x": 1461.1908884915438, + "y": 612.1889204545457, "strokeColor": "#e03131", "backgroundColor": "transparent", "width": 11, "height": 25, - "seed": 618433805, + "seed": 1906124952, "groupIds": [ - 
"ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, "fontFamily": 8, "text": "5", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "5", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 611, - "versionNonce": 430626572, - "index": "b6BV", - "isDeleted": false, - "id": "fJGd6Pf-SaTmbDMUGHhUW", - "fillStyle": "solid", - "strokeWidth": 1, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1028.3972327492456, - "y": 322.2812500000001, - "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - "height": 35, - "seed": 1491526540, - "groupIds": [ - "ssihZCwGeFNCQehvjAg06" - ], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "Ax-8fSsrXvrkMhlGAgJgO" - } - ], - "updated": 1726708934863, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 302, - "versionNonce": 1859392908, - "index": "b6C", - "isDeleted": false, - "id": "Ax-8fSsrXvrkMhlGAgJgO", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1044.423603404652, - "y": 327.2812500000001, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 15.219985961914062, - "height": 25, - "seed": 1943704076, - "groupIds": [ - "ssihZCwGeFNCQehvjAg06" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726708934863, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "B", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "fJGd6Pf-SaTmbDMUGHhUW", - "originalText": "B", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "5", "autoResize": true, 
"lineHeight": 1.25 }, { "type": "text", - "version": 259, - "versionNonce": 2035385356, - "index": "b6CV", + "version": 552, + "versionNonce": 531850136, + "index": "b6p", "isDeleted": false, - "id": "07qZABiLS71UbigBsFpnK", + "id": "vbXyYItXCJiZ95GHEna2G", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1002.0335963856091, - "y": 327.2812500000001, - "strokeColor": "#e03131", + "x": 1432.8286670338025, + "y": 661.083806818182, + "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 11, + "width": 197.33984375, "height": 25, - "seed": 1965424820, + "seed": 169629080, "groupIds": [ - "ssihZCwGeFNCQehvjAg06" + "D2eYatwoRT3Be3gQajaM5" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708934863, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, - "fontFamily": 8, - "text": "4", + "fontFamily": 5, + "text": "C is marked as spam", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "4", + "originalText": "C is marked as spam", "autoResize": true, "lineHeight": 1.25 }, { + "id": "-CNAjEmW6cbufb2V3aXbb", "type": "arrow", - "version": 2600, - "versionNonce": 1259679372, - "index": "b6D", - "isDeleted": false, - "id": "M_WCuesgPRdSQ_zqaUtz0", + "x": 1388.4659090909088, + "y": 250.5312500000001, + "width": 113.16269233010075, + "height": 228, + "angle": 0, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", "fillStyle": "solid", - "strokeWidth": 1, + "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, - "angle": 0, - "x": 1113.5321305627851, - "y": 279.97561555378826, - "strokeColor": "#2f9e44", - "backgroundColor": "transparent", - "width": 154.2895204048931, - "height": 2.3372664247598323, - "seed": 1489010356, "groupIds": [], "frameId": null, - "roundness": { - "type": 2 - }, + "index": "b6q", + "roundness": null, + "seed": 1354092264, + "version": 165, + "versionNonce": 
464680344, + "isDeleted": false, "boundElements": [], - "updated": 1726708895234, + "updated": 1737530583905, "link": null, "locked": false, - "startBinding": null, - "endBinding": null, - "lastCommittedPoint": null, - "startArrowhead": null, - "endArrowhead": "arrow", "points": [ [ 0, 0 ], [ - 154.2895204048931, - 2.3372664247598323 + 113.16269233010075, + 0 + ], + [ + 113.16269233010075, + 228 ] - ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "NzWqph0M7tEkeTDKLPGZR", + "focus": 0.4253246753246783, + "gap": 5.000000000000114, + "fixedPoint": [ + 1.1057692307692308, + 0.7126623376623391 + ] + }, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true }, { "type": "text", - "version": 176, - "versionNonce": 14571020, - "index": "b6E", + "version": 1099, + "versionNonce": 1108693656, + "index": "b6s", "isDeleted": false, - "id": "wkavhEPwz2TNGwf8xFeLA", + "id": "ocrQNX8WLBEF3z4H5qV1Q", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", - "roughness": 1, + "roughness": 0, "opacity": 100, "angle": 0, - "x": 1263.0335963856091, - "y": 188.2812500000001, - "strokeColor": "#e03131", - "backgroundColor": "transparent", - "width": 12, - "height": 25, - "seed": 809955212, - "groupIds": [ - "uHtPh4-PiLJtgc-p_Cdgo" - ], + "x": 1506.5825046192517, + "y": 291.4184149825713, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "width": 135.80796813964844, + "height": 58.225670034857664, + "seed": 1216046568, + "groupIds": [], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708942969, + "updated": 1737529134305, "link": null, "locked": false, - "fontSize": 20, - "fontFamily": 8, - "text": "1", + "fontSize": 23.290268013943066, + "fontFamily": 1, + "text": "5. document\nquality", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "1", + "originalText": "5. 
document\nquality", "autoResize": true, "lineHeight": 1.25 }, { - "type": "rectangle", - "version": 538, - "versionNonce": 1071049484, - "index": "b6F", + "type": "arrow", + "version": 1524, + "versionNonce": 2138633960, + "index": "b6t", "isDeleted": false, - "id": "Qaz1byDgzm-0ZrVLBmU4v", + "id": "uJzNGI-VzOHyMa0kMCtyo", "fillStyle": "solid", - "strokeWidth": 1, + "strokeWidth": 2, "strokeStyle": "solid", - "roughness": 1, + "roughness": 0, "opacity": 100, "angle": 0, - "x": 1288.9545454545455, - "y": 273.1875000000001, - "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - "height": 35, - "seed": 144156909, - "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" - ], + "x": 1450.701621813599, + "y": 572.658384798537, + "strokeColor": "#2f9e44", + "backgroundColor": "#b2f2bb", + "width": 231.1460407851796, + "height": 1.29512872695625, + "seed": 772325608, + "groupIds": [], "frameId": null, "roundness": { - "type": 3 + "type": 2 }, - "boundElements": [ - { - "type": "text", - "id": "D2HbgzHXdGyxGppwaWbBy" - } - ], - "updated": 1726708966705, + "boundElements": [], + "updated": 1737530585216, "link": null, - "locked": false + "locked": false, + "startBinding": { + "elementId": "KCk9Ks3UrLoOid_qWtcKt", + "focus": -0.6425776620043193, + "gap": 9.23926667794467, + "fixedPoint": null + }, + "endBinding": { + "elementId": "TL7ufCnIHYiHVmKWJljll", + "focus": 0.14400907570834828, + "gap": 5.546510718694094, + "fixedPoint": null + }, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "points": [ + [ + 0, + 0 + ], + [ + -231.1460407851796, + -1.29512872695625 + ] + ] }, { "type": "text", - "version": 296, - "versionNonce": 2108300212, - "index": "b6G", + "version": 1200, + "versionNonce": 800272536, + "index": "b6u", "isDeleted": false, - "id": "D2HbgzHXdGyxGppwaWbBy", + "id": "AWSDUNN6IaU5NZQ1ScgSU", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", - "roughness": 1, + 
"roughness": 0, "opacity": 100, "angle": 0, - "x": 1303.6509142788973, - "y": 278.1875000000001, + "x": 1276.7246173511853, + "y": 540.4184149825712, "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 17.879989624023438, - "height": 25, - "seed": 2062418765, - "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" - ], + "backgroundColor": "#b2f2bb", + "width": 124.44776916503906, + "height": 58.225670034857664, + "seed": 1343739368, + "groupIds": [], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "A'", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "Qaz1byDgzm-0ZrVLBmU4v", - "originalText": "A'", + "fontSize": 23.290268013943066, + "fontFamily": 1, + "text": "6. removing\nspam ..etc", + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "6. removing\nspam ..etc", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 569, - "versionNonce": 509454732, - "index": "b6H", + "version": 896, + "versionNonce": 1019725032, + "index": "b6v", "isDeleted": false, - "id": "-LxVJeZLqj0MgI5FEg_pm", + "id": "Rdnl5GxK4pFbFoTLI-oOG", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1281.5, - "y": 179.55113636363643, + "x": 1164.6454339460893, + "y": 503.97869318181824, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1514803629, + "seed": 1661634456, "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": { @@ -2483,41 +3132,40 @@ "boundElements": [ { "type": "text", - "id": "trFDjiJr6cfNlCSEKqNjE" + "id": "gfBsltp4ourNC3Fnk9ClO" } ], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false }, { "type": 
"text", - "version": 311, - "versionNonce": 1054115124, - "index": "b6I", + "version": 640, + "versionNonce": 674323864, + "index": "b6w", "isDeleted": false, - "id": "trFDjiJr6cfNlCSEKqNjE", + "id": "gfBsltp4ourNC3Fnk9ClO", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1298.3763691295276, - "y": 184.55113636363643, + "x": 1180.7817975824528, + "y": 508.97869318181824, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 13.519989013671875, + "width": 15, "height": 25, - "seed": 1674925069, + "seed": 1149621400, "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2525,34 +3173,33 @@ "text": "A", "textAlign": "center", "verticalAlign": "middle", - "containerId": "-LxVJeZLqj0MgI5FEg_pm", + "containerId": "Rdnl5GxK4pFbFoTLI-oOG", "originalText": "A", "autoResize": true, "lineHeight": 1.25 }, { "type": "rectangle", - "version": 566, - "versionNonce": 713594892, - "index": "b6J", + "version": 892, + "versionNonce": 1875358696, + "index": "b6x", "isDeleted": false, - "id": "Kxu9owye4gMpRvh7kJ1Nl", + "id": "TL7ufCnIHYiHVmKWJljll", "fillStyle": "solid", "strokeWidth": 1, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1287.590909090909, - "y": 226.73295454545456, + "x": 1166.7363430369983, + "y": 551.1605113636365, "strokeColor": "#e03131", "backgroundColor": "#ffc9c9", "width": 47.27272727272725, "height": 35, - "seed": 1938377325, + "seed": 1393525144, "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": { @@ -2561,41 +3208,44 @@ "boundElements": [ { "type": "text", - "id": "UP92rSYiIXnnBFhov6WNx" + "id": "Qs_O62O1HCrusz6mXeH8i" + }, + { + "id": "uJzNGI-VzOHyMa0kMCtyo", + "type": 
"arrow" } ], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false }, { "type": "text", - "version": 256, - "versionNonce": 301317812, - "index": "b6K", + "version": 581, + "versionNonce": 711060120, + "index": "b6y", "isDeleted": false, - "id": "UP92rSYiIXnnBFhov6WNx", + "id": "Qs_O62O1HCrusz6mXeH8i", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1303.6172797463157, - "y": 231.73295454545456, + "x": 1181.8727066733618, + "y": 556.1605113636365, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 15.219985961914062, + "width": 17, "height": 25, - "seed": 707753165, + "seed": 500928152, "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, @@ -2603,206 +3253,125 @@ "text": "B", "textAlign": "center", "verticalAlign": "middle", - "containerId": "Kxu9owye4gMpRvh7kJ1Nl", + "containerId": "TL7ufCnIHYiHVmKWJljll", "originalText": "B", "autoResize": true, "lineHeight": 1.25 }, - { - "type": "rectangle", - "version": 593, - "versionNonce": 5355148, - "index": "b6L", - "isDeleted": false, - "id": "KMOsOR4pOx-ute2ztnw1k", - "fillStyle": "solid", - "strokeWidth": 1, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1293.318181818182, - "y": 361.4715909090911, - "strokeColor": "#e03131", - "backgroundColor": "#ffc9c9", - "width": 47.27272727272725, - "height": 35, - "seed": 635317229, - "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" - ], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "SsRO-f6mzQzf5jQOudz6C" - } - ], - "updated": 1726708966705, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 287, - "versionNonce": 800311348, - 
"index": "b6M", - "isDeleted": false, - "id": "SsRO-f6mzQzf5jQOudz6C", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1310.6645521684127, - "y": 366.4715909090911, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 12.579986572265625, - "height": 25, - "seed": 1382819405, - "groupIds": [ - "bDrNCHlMlNcEbIn9yZXly", - "XEHMHITFJTjudNYgVFCPu" - ], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1726708966705, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 5, - "text": "C", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "KMOsOR4pOx-ute2ztnw1k", - "originalText": "C", - "autoResize": true, - "lineHeight": 1.25 - }, { "type": "text", - "version": 206, - "versionNonce": 745735436, - "index": "b6N", + "version": 457, + "versionNonce": 351906536, + "index": "b71", "isDeleted": false, - "id": "US1PK13ekocRlMvOrHSJL", + "id": "h9eneFYpYcKGCUroEQPXT", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1265.0335963856091, - "y": 231.2812500000001, + "x": 1139.9408884915438, + "y": 509.9389204545456, "strokeColor": "#e03131", "backgroundColor": "transparent", - "width": 11, + "width": 12, "height": 25, - "seed": 1525760780, + "seed": 2119562648, "groupIds": [ - "bQ__H1TgpJXskAm32UBLZ", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, "fontFamily": 8, - "text": "2", + "text": "1", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "2", + "originalText": "1", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 241, - "versionNonce": 1274323380, - "index": "b6O", + "version": 514, + "versionNonce": 284743576, + "index": "b72", 
"isDeleted": false, - "id": "NxUqy-MsYDga_9XDrU9l7", + "id": "2FH_CC-PbldTPMTV0l3zg", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1267.5335963856091, - "y": 277.2812500000001, + "x": 1143.9408884915438, + "y": 556.1889204545457, "strokeColor": "#e03131", "backgroundColor": "transparent", - "width": 12, + "width": 11, "height": 25, - "seed": 1116920372, + "seed": 3375768, "groupIds": [ - "4mN8vM1PMjtKHfzWdqXES", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, "fontFamily": 8, - "text": "3", + "text": "2", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "3", + "originalText": "2", "autoResize": true, "lineHeight": 1.25 }, { "type": "text", - "version": 240, - "versionNonce": 342262668, - "index": "b6P", + "version": 639, + "versionNonce": 961809896, + "index": "b74", "isDeleted": false, - "id": "lSEPKkiY8if2M9pDun8DS", + "id": "tn954yHWPQx-IDIpEMxaF", "fillStyle": "solid", "strokeWidth": 2, "strokeStyle": "solid", "roughness": 1, "opacity": 100, "angle": 0, - "x": 1270.5335963856091, - "y": 370.2812500000001, - "strokeColor": "#e03131", + "x": 1116.8286670338025, + "y": 665.083806818182, + "strokeColor": "#1e1e1e", "backgroundColor": "transparent", - "width": 11, + "width": 135.03990173339844, "height": 25, - "seed": 932194828, + "seed": 1349893272, "groupIds": [ - "Z8bVLPerSCYHViV4Ld1Ed", - "XEHMHITFJTjudNYgVFCPu" + "xRJf_6pX20sfp3DbcQgRs" ], "frameId": null, "roundness": null, "boundElements": [], - "updated": 1726708966705, + "updated": 1737530585214, "link": null, "locked": false, "fontSize": 20, - "fontFamily": 8, - "text": "5", + "fontFamily": 5, + "text": "Spam removed", "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "5", + "originalText": "Spam removed", 
"autoResize": true, "lineHeight": 1.25 } diff --git a/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png b/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png index 851adbfebc0511560625bf5e7afd35dfc9ef9d1c..f40893ac1741337134eeffbe1c83fb78c68a914a 100644 GIT binary patch literal 127061 zcmb@ubySsKv^7i|8tLxtR_Sh(mXIz9i9-nI&?QKhN;gPINavwj8U@KiE8X4S!!PcA z?|tugzkj|lo-rIl@tkL`z4uyk%{dnln(9hVFexw*5D=cID9dXjAfUhy5Re4XQGvhl zg6N_Fe;~foR+2^dHcGXHfIx?!A}^!kX}X(<;rUAUx^Ia~?T23|*fBU~b+eM4NCEPO zJtsVpGg4N|1_57F-at`H$(q=J7({#_lN%mBbV!Og`-WIv0h5T$shL-$=58Z{fJ@Uc zJ-t@tBx@t$?I)kMlgy>2&ePT~Pi|NT)$ zB7%7_{4YNM{L`g4VkBI$JtX7*{bT<6fL?GG^8d$MX#`p}jPcU5{;%)OBwdxh@xOmp z;22@f%-+e%f&Xo7z)NZF{t@}#ziasf#0I&!C2;%Sq6w@D2oui#Utto;ME>=vXjF;g zp~h1yJ>KQM^;in_*6ss6B0;eN5(Gvqm`;p?wv6|1q*-I=G;a5dkL;9&BFI7|5TyKn zKOGIgGVi0#2=nTEGbFjQ6`md}uxG2mASUA{NHSuQ_rVwP0k-e@Txp!`=JnbNoTqW- zEOQq+S7roHBt_9BB0fP{#s7kHy7x3@_LDR2F!Ews5xhMFR$1smx z5dKC4cHp_}^5xMpB2rSX+7ctx^?BNJ%jx?i{>A_y_x%~vPMH=2Jw~iyng1a|fgs-x zflEyqnOiVRjakuMQxj##UbRZ}CXHkV>pBS@|M?XhQjV^f3=rF@SG^#Q}hm$iZ z(m}oxP~98FpAMY=VhPzt<@ICZ9Nn>-gjiZWBQ-sK^JuV_~GgfKNT_iio8vt z%wEJ@_ZZOcs^(o}$%l)guduV1?Vm^JT7C4$ly-`Q2Rq3cM-#R9KzGn%6iudG-}(Q~ zna9Nd$)6n}#;v-&ciYKY(wte!S^`UtGeBxJ*w*6Tlp=N%E`AyT(za|KvKN>zn!-G$Uw75 zc&Nj4p4|w=`bhTrL-z?gkU6azibql!WVKI1dt zA!?P;FoP%u)dkLWC?ZiyA}$8*mR5#-NSG;k5Q&g!xu>{=^~cH;1d1}FB4H9_;2qdlm$@w)QdD%_ zknK0MvVwPhDqQuD1wG~>4n~(#wzBry;I48bty&MxGDXJPi3w{4cX#(xbZR^VweAnv z(Mc6T_NTTwQ3boUrFFY2x3gEx-`($@m_L1UfDgN=99YB4PNS=B0VH69tufwM=N`6R#3!eD zo36*wh!TAu;ch`n!|PX5iq*+5vt+;{lK;dd>{dXVuMAkFy#1R;Er$^YZcD>#~{)GD?V$uYCVr?I!Bl7hQ5uUceZ-xmRnH zaP=V+QJ=FRxwK%*<@JAijUhZB`G`r~vEdA#s}g@4vF!d(uv2NF!r`b#nPxmq)NK$_PW^oEh=rv55N@Mk@m^^BXj5^M#rDdkL7{(1>Q14-3jQPM`Ug9AB{8c`XE~f8oDP&qZ03|>(H?ES^^X!+r z;lh58dZn|kn)F{wZjReyKC#87#rc#!$!OW|wGwooVHWXz=|HJM>BjQn12y{i4&QDa zk$j(mZW}`))r3noeUI~uQk484(3!v|(HHQA7hHEA(Z%80MW zYkcHM1S4jAgFxWI2^4Q|?{{+`%97I3;Tl~vKp~MmJb&{vG0F00mIV3}%&wSeW8`wh 
zd^_QyL>1+O#c=;41%;}aRM0j7qAUJTtplm!Z;;K-_fLUv9AT_bsAH*7%{dhe3 z2s_@I5tvFL;kKQ!Ju`f=NYQ|IZ7xCM&y12gVG*em1+OHH5u&oOZm|4_EseBKFI_b+ zK}8-4QtoI#|5EpsQK+!&=6V%p#dE&#GJP2IrrgV55eGXsVW*UAGJoZ_JDKL%wg@;{<)5Qa#@- z$bvqAUR2FfC&Ifbj4mPlJSN-q3*_OzQau2CYGT`;1#Dj1OS)Dwsh(I$G$*^_ein-= zL;jomxWm5fDZ8a==09=NQP&BB7gG3b@U)!@CHWVT{cg7A1cwkL*-a8CEYf~|bL;02 z&;6I8WQmqajEcLKNgXm#_rR^9?D&+l?N9>&=Kr|&&U8SqpOk9ab0}=LOkM87W9;Ci z)S?RzLPtF@oLeYveG&M5uMWpu^MUUlQ{(A6Vx84rlB7nv$1!#-Qt6frQMWTOpar#M z%c^;s2CIXyfSjDl8MvG-87~(TdH}2$1F1G(ajxR$)G@zOtK0!L~l3kA) zA%FD>U%-W~i`Jivqp1HI)~onL_`40+QaVNJZkyk~ErwgXW?GzSn`^-_P5DZUvO4}8 zj@rB6SpxigMfT{PeWqm?Xq`q82B8O_64^Hx&gB}ew5OYoE1~Wp`N06y1cy39X#WW_KAombfrK=|pWZ_^k$oD7u7HOZ^C$ zftv@A5G1wf`q(~EVnHMj)&Ld?o3!lR8JcF;TMsq;lqLJCQN}O2rJXnUJ@!TzaNODF zqF*AlsdOYIhVJT{${bCb4Pk&F+aW3BPjmF>sIaQUAlL3>mR1r-3a?XqYaB@XcwYx* z1S6g9jU!e_-V4hjArr?Xr>9}2cL!~5Wa~ec9N>tBk5!g!?oSP|hJaaZOX0Dn{j|x8 z@D{Od{V~ZuYX6|UEav3~%Zt>|cdnb=MZ^;xGt@IDXFR@_T`;y`9_c`tu%16p@aCV< z=-;-oTc-D{M1?w-i>HQK)F|eC_fR8_VHJI~s}~&)d|^3=qhxpS-67jCmVHUJ7~Ztm zI}{|OS|S*{xf)v$$>V+C*)Ta-k!sM4@91o>_o?*hcW)gENN2=ac`F-eGe-V*}n5C zF7>134BbiOPTjE!r$(xJUd&nxpLWYEi21S2owms&ght$I_YZ=Tr=zJ=ItW3VvwIjN z(EwCl=0W;{j218AFFq+FgT6nFi&CoB8UHzC^ZTbU8Q?dp75?JumTz&f55vFw1A`}F5dwT zJKHrd=r!?PM;BmKmDX$kho|w$B-lc|-30PpX^UH0xt&h3-&&1QeNGu^Y*51Gy4`(x z3SOcadqTZUQ*{G}1`Q~`T>(gnd*$VpnY%d$nu=|~|5ir+j7caOTbo_ZJM%ppV@Y*? 
zLwY0iPi84zdt9RA$rYOL$y$Rj{}xYj^2N-3{_DGKrAGQD0;Hf~iOFeIINe%i6K zSxB-!Cjy$UJ60db31e(uh+r|hldYQnooJ;*;KB>kwf*1U?SSxQF=|#@V^y=L$+|4H zh5N2z6uT$;L%(isTE2rFA|fyFKuD85FJBRJ9?o7Yt$n@Om#YZ@MW;*A-hKRdIE;+j zkwa&@u^{kgIH>KN?OckEN0paxCZoB82j*(9?wUfBxTw%d>%&wmwdZk+(7LVH-!(?# zD5Yu1pOg3XM2!*ui$^f}_ymcn09;5MH)k)H)`+jrXamET3u06z=v!$+sBZgadSGE~ zN6FHqT^*eZ+8INs^9$V@M;1v^DaZ3C%Nw<<=y93%c>LbabmpYWr%S}h=^oUFJ`%T7{D(~lW4UQ34R{eSYZ@#P^MVIQyEVYWs zUSIr`S^gy>wrL(2qWZ#4AAhB&#QE2=dsaJr?mIj|QN8=xQN$IzW>K}D$e#;r^*!0P zTEFwdr#jszbRvQ|fFyvFp3SIvdp13gCV-L)g@=X*+!J!3K#=;rzRHb@lvURLI4^vA zx>G?W=1jvNC&zDlYWFN6s{3IN9>st1CpVx!HsyBt{rIb02&FUXu`^N^RLZpvJ1!D# z;N^9E0`ppubbkw{c&8MnD6brDdDs^x_2JJTr&s5Tzwp@Kd(o`Y!P~pP&>Q92<4Z)2 zdvV&NTQIZt?qy%dj&M+KqWp zvLkqDysw%9+GarXhD`VB{?1U&!69%F);#^4%%`3mJ5De6ISM`|ffn=PH%XPzrF%QD zB+rY|3LdOBEk`bvg?G5%)+5Ed&xD#B8*W&uK^3n55jn`QGosn#kV$Nrx_Eev!sS$(gypKZi^jX;B8aFW@7T zgZ0lvD}l&L8?o|lGmcF<9V|scLsizVGpnm%NBi*S{7n5ot}6}<%%8+oQWwJRJR~b)E|lc{>XSF21-cLdJQFY0{w(}O=ng*)=uhE zZS%fmg;4=R3abw~}o&w{@B(6I9o*{jkE3 zn8DyGfH(})Ai&ejP9XJfz z@{v;8eo)t4>=S-sZbg}4C^s68`RL{Z`0MUyiWF@pGAiAdfCu}LRJ2clLil{N!hyp_ zYq>a>vM%lZY_IDuKYU@5*T~}Hg*@oKi-gCJa=Yz(4)Obrw&eSsoKydWtC5ZyEdacu zv&2Y|!Oa*NOHf*OM8v_q=m?qpzL?5P(cVF@HPh@oF2-wI8u^tlS_aQcPnq2~^Izp~p6IPd3+eSME=JcaxDGldaPD%$f7)1B>}{t4S) zE>cCxq62U)wDWuZk5Zmx)q0}%r!@Op7;WGP@|F!KdIdNkzjzl%jXYLy&)MUpBkCh7 zFNt_kjZ*3O8q4_4kntI#cVQS_OPz_2jMOp47OlCQp>**77Qv;r@M#|iof_}R4wP} zcFc?fonrLbA6S^wVeg5;bgn|=b~2LaPfJkA`9}Ci2y^XL^w-ndj@b9KwQ6B5iulcW zB^E>gr)aglk{4f(a~1k(Dcuz=wW*;$EtQgy4JG8`%|IEN-f0#4{%-cI^FC%b;n=18 zH`R|os|iI%uM#tSO;BsMBBqi(^lU9g2v6LTuNRzcB*Nk=g2Zf8Pv`K133Ra+HNcux zJQ=qH@PY!0F~MS-Fb&F>luC2NdRoyCrB9!J!zpG_P%hWoA3`g+2p((Vk7{K{=^ZyL zEV;hnRBcfH%$|S($_ukkIHDWTdrLm%Ga!YIq)SoSUY+@gC0>{pg60E>;8G>NO`@$nEG z-qfD9^?GSJ+GxF*cT02CiD#+Qq&_cIguyp4Oa`tjuY<-u(}&`5Y&wfS(3Hd60H4f0@` zIC<`SRC>!(yKphseM##5!9zShq-?$iyz=wz2kc-KUeVUPTNz_%#lMo}fsPz4l{TUP){; z!f8suFDK#$+LMh_sBFsWW?; z4w`eDm_!Q;D8<~!CTZF&#vOptD`GHk}(#jdv3 
zA=MIJ8j0?r&t6NM_WQpdw1IKS09X^^#&1G!h>t|x{eiy4NT}i)4vcDWK`lM7k`Ju8 z#kfK7vn)dz=$;uJ28luzJxCeuuQZCErL~j5+OJUilFgDRbvT>~e`s@=)}U4dT(fiO zaprY(jpxZA8??gDOG@#Ir>*9yt??`yX5Q4B%7^@*Pq?oL(Aw2gO+dU24J_XN0FUkR z*ja!Cm9orWRL#4evJp(*^ucSNcZOI!RS==y37r^rEFsUS}#j7!F#7ro#Z z)WXtl3ZsEy*1zi+rCCdBboH=w3P_3_^+FsUWH4uwu3gcuT%R(u-BRVPiZzTmv2D(f zBhDJ2FWw_#eiZG_{VAst85Jh1XkCQ>p=6jYS6m+R`R$PGKm@xx!v3A1(s2Unp&h%v zKErqGev(ByoXNO(#%I7aVAS&ZI;c89+JsK?c1nLBO_Vqnx@3}X3H{m|-0-WsJ0ngC zfXL{kRQyOLMhzdD7fhV~E2c8jNqVTXO`nv>t&*K$R81vC+LKRuwu2~!V?htK0zUm3A~5W|*X zYdiP;T-!o(o!`!2_nI&MCfop?fxU%lrO5>P|4{n64ZMlHE0Jr|Ap0D;@6pf|&)E-Bk}*MpQ^b1yt$3 zK*9Y6>3y8c`}Gf>(=gB3+4<9t21|5j+?EmkMAM86f}O>~QeI`U(DA58!BQ8R=$#Zg zjw2)2S?ZSd7h^U);Pl?Ldh1>~%X*k&9qi||`ryMH4Lq8U{?ZZ<00RLC2m_O*8qz)i zOaaAn;>;;N-8ZaOG-1R->CJW#`eN&yZ?9nQzC8~fE-SzV3L|cO(5Dw#ir&0AxVZEL zlhg6zE31B{^aw5b;`07QgS0iCGo5COYkn<1M$?|4d@MCq;{F0Oa)>!OJFb?;M4ZRp zD^l(H-5C9A$G+5$t6%BKl0n-aH{*cqCr(%0g3x(7j_xrzUqtlnTnC<2rfO+O?sAuP0>I;VTqA}+sH#tQvNY}sAZ zGI{iU$)v9&e7Kzh_d% zj*!4Orq7>AvlO&@bv0mAuWCq$&iTv-%u{ANDoss*RNh#euO~O%($rF%9}`!>H(e{} z$$Qil@hn3DJ_5;lPr3i3d{r!<#u7RDJVqFHV zn{RI--{KjaW3Gh&zJ)6TmnwEUE%3E>1s1pf&q2XNR|0#g7#-;OtFMQyQ4WpFur?mo z+5T5B64CuRW#q4KiA|V@@x-}{wX(wn%^*hugMkcTtw_schq*@K$#fCIW`}QI593db zoD&nXO#KqJ1wIj4LJ#BVTg394S#jbnz(^3WNT4s9eYh8L6cN!xL1nP%!v=Y5<*_+i zh}hhn=>u#4&`ED8L>>0lJE7y^^eymPt=rVSNZ(pldDC);hB7{h=@3sAkh7o21WZ6iO~+ZLH1s)VDOaQ(XSQQ%>c*Pk&>feVyYQC*Q%KHU z@X;G_gg;9GVO!(5C;TBAQ00evnzhajre#i$ymdqI>1ln`E8P`a<0_=IB<`lSCc5TF zWJ`4B4f%&PWAPI2Sg4q-mmYdEDfoHYqdQ`$iAOpwho4r`NNn?08R>Wg)LEfD62!N8 z3_T%XR`-M&bX#){#J{;k)2nL|j)tEC;?E~Q{5g~Y_`f^n5fLkZ4+#Y=Fhe$=t;8ro z9ziC~(yh=WX=}l?UB-vNL3^3aUux(}m!ZR-EgQN}tf0Wp#}VS?HQ=@M*6dBPcYu?v z3^lU9xj0%hX==eTes`IvEw@mAZ{iOIkOMJrb0IY^QekJauzu& zI)Iib(zD=V|A~`)NPo}%X&IOJ(e;k7+bRg4!2#jpsE<%12R^0GIcVh0BkoJqC-ROB z!pZU-foVD5(e<;G_O1^&#h4gugs}F&<}aWEj3)_x{jHTfeT+a=3BPtH))C~4hAxDy z>N({;r@%k^4ja!FMul|-29HW4EPWLVhBzx-Ld|5;KbaI5*!>NZb+%s9$p34g(7eDv z2Q-x4sDeyP 
zv^@MHL}Y+Hff5!c%}^6WcHv}dP8Lc8L%|B0e-3hZo_9~?F&{U>F@0Rn^Yppj?+>$k zy=;5lOd}9hDD5Q3xI52_?`hY*&{Lfw$4bNk56hm)6AlyAv=bMv^@hRL{maS!^wq@)30BHqtAYw~jmd zJj1>FvYtwqZtvx4Uf$tqIKKAfqW2*kNi=2HuaWe}y@KdH5-HJ-YV84l9UwnHlFkpc z)V-i$x#IH2)kYi4M%G2W+P#IV$q8U%#7lGI7st+UYuAQ`bdk^?I+^lJhN1{KguOs{ zX(=Z>X}|eHW3U8#)ojRJ+NO)(zSaMmHZO1Xy{vgmct{}~E(QMD!cHQ6&Ws5;eH)Oy z|61O?LPt|mS>+Q=SIVc40q)w&+yEtZ1v zxMK?XK@3T}@~Vwyk{>zae)a6fG^*n2Gk5%Z#(NooMDmL1oAWQ#qaQ^3OK~kzr3En< zl#j}iP8fg{x5q`HyAf+r^!-V3R6aDSkau;AWRu=x=7Ew`h`~Cj@(zsb;ri(eV)4>@ z_L7n+z^z} zF;*Rwm2LmB+nnAblVyh}Qpf z*wlCtlhi53j2|i&Q$CZIzERigojfVIO##mC-!{iUD4vU9TDYx+hq>JZ(lrS;-p9}G zv!GCfJx9_9E=!#Pk+{-L&!P?|NjFd-qtd02rf%SIil|&iR=N=jBwPHe@vbBUd+R-F zlhgYbD?eNJHnREeV}x7;p4Q0zv9F^5FiVmp^i!j-!VUwEJMrBZNhoO}k;NNZg2R_U z<{#sRxo+#ACm5lBuU9(gN?J8B^^n~hU6)2*su~iKz0Hs+lOmv@jIvYvqW}B*8|98A z^xLC;trS)3th>b@?xGTD&)?l8DLJR2o&0_H{zf+R?xf#LWAdImm~*q5zuieR;o^Nr zW35T(YIuR-pqE6Y$l1~}We>t7>yV|@2)1>sV4&;W#~?Jvw`|ZJ+qdecv?xcC( z%@C4i&c>?(Gk;%^x6X9&Nf7MbUWLW@!7C~xWaNo2Dz7RABOW=*zaLL2M0(%+TB7l- zH`T`%Zfcjlupd1(DV47WY4QxX#uXJZX_eYYsC@6CV96~^>+`Qzpf54se`<8Qow99@ zWj%pEpKcWA-br8_0NMd=%Zmg;Rz8ar+1;x%{e_99r&)f7GV-CJnPVgNH5L;a*;eOD zv=%*qh-A=d6ZCO=lTj1*&z2X9t>Gs1s#;VK;N}Bm+@BW`We?VxqW;k?g;XRN;~eRK zcwHln4p-~vl6`B;6gx^C5i8?lu(Een+6UU<2-POH^mbotJNgN32^n zAKuFFmPO{@D)x6uRPI;_-0G)IXmDee*KO#w4?XL_5bimBDx8Yx5{8A)>iIY+zY(LzD8oCEi#&oGX7!qha>lNvTccQ5;_ zZ$5TqAhk$2G5f<^$+RN)!<`7idPts(FQ+MnJ12#QH!An~>ym0{>OddGgaOZD*0dP! 
zv_v>hHsUVerTS~Pn3hS9uC6NYZ)<8l)cpA)J6}m(JCcr!h$zrfDdl3wFr%LNQR?MO zP+0zM)5pS?h=hL8`CiI}t%rN)RZ_jdQ#v;ExxSdOI9t2vZ`Fss+5gy}wyA=meGU@a zZeH~4&jmB|?n9RpR+G(s>0weO1LU9*hNt{){r$PvaC$>Wi=P-*J5`W`7+UP=D%QQU zD;lqwJV;hCR@%H>QnUXTl>N)$4#{3eMlM@@x^bjdyh$K5%L_}&c`-mGAki?iZ`w3y z2UaHw^QXp3T$Xe|d~5yVnxpoXHy#Yv?!o zWWVQTROu>~Vxq#hhCHTPpxl=*0Hi;e!Qgr2+N%haJ;Vvv<}<40UZPcAKv`AL8Jcne&Yk2uxk zSd$`zjK@$}hs|5uom2G57N%dkzyk-q#_Lojuy8o%qOX$H{-ZYk_ijof@em3_(`#S( z;%KNV0-`8U6)#x5^Llk-hoMd#jr9pKNuB z8PxS~i>8q68$45G<;VQ{#7i467BBZ6@cqq1>V1qV9)H!|c&r*pkIhpQYu_WEzD}JL!Y*3QeIiHzOJ| zm%m5{B%ON7yKizgDkahRKJU5h+Y{=saQE@_67RIRJqO+V=hg)>AcZ=&qO4ImcZ}zu zpP$z9%4{D%GabzRN`g)AlrIcI6ir4WJPku7IC2D5va zMI<#0h!rf$l2!Be3X7+tnx`xM&jnXbEE~wQN<&xvq?B8qU z|L2n>^m_d}U#)%UroP!3Px0^v;L_}1G^*_ah&Zqe?L()Qj*iu*HtAiGeEVm7?t9T4 zrmZ=cSmoL@Cux)5W9@3}aRlON}!LLDZsQyTH zoS3CXbO|L-ZF2GA!+JLn5z%)SpIpV zYEQP3TrZVsjrau6|GvJ-gnWKi%mP_niiXk5qbV!DXOjw8&u0fxp1%aMQD=P&j=#V2 zV{m49H0(%#7FZyZVGFn?``PyHHpG!|1RYUo>{jnV4rg7LL$g1h`F(}er(ZP{RL_Hn z6XDqwKTDilm)dEXI|}75y=dp$@Jh|@@cK@<79{U^7JT(di=W#kr;7A~hk^rg6LSlO zTxSyg(>Q8OPuBw`B7Io_)}PO7t^EpA{dh^8zL`^;K0l7h%9C4@f{Q#Mm-dxPppvh};EK&yFNwzo9 zWI=r}C?|5qB9LVKRXUt3p?%zNbFoia`FOmnkSHb{ZwVylJq(j+9!)gU#WBN6RU6an zWxyNY38^q^M5Rpn*6m?pNPjpjBhLR*W~mV~?7NL;bCl1hCya z>Ke_um+F)v+ImqKoP@xE2*N~98IkrDEwdd1lnlqFrehvMdQBsZtk1vDM}bO)NQHCd z;L&muim>7MJ3x|FAAOaBxc2C1oP}jD`K;V%NW7<)19MgUok|V?lFieybRovI-^m5b z+>+T{KfZ`&|K#BRwS#!n*ccI@)RWJ_&4NnK+r}n!0!W=5E$>(#XibR0;=sLKCU`vf zV+-T>=PfN13b*=}4a)M?d~}u0v}E8d5E;j17-@VuhxJ1%eXN9E*y=*)%-z%v-oxIm zG5XskUT{#Z%;wD{!h*Z4?IyNp@0v*O{?m#$g^T5Mz)+6xpCKy7K?K}A4fdh&U%qU+ z!`5e_GpKI8S3#1O=SH{g>*#Z}c8EuFi%b6Rzt-x|@N%?fQwe}1=Ih?;>AdG~B!Blu zdcH;EAzp7F0lve6_y{-C#Yi7zAPme3vpbP?Gy0E-0gdU>3P_EfYU0C6vC2`%6CoX7 zyIF!2tA5$}FA=bx=Hdj!VxW-ayOS}t+f^*U$;!m5io(kL{Q;`~@|1+MP96)3(+B5C z`RnCx;l{wkf+oTnYZDItowG_*)UM0z(v9+$^19H|+P4e(`TH5685kIoY=}aUaFxo9 z-_v%-i4kxHa=;6KbaNJZpOjwE`n`7~)3TwG;|kJ?T9(;BA$!@@__mg!c6*k@kCrxk zhI5suCU2&kj2<~Y&=LulmttmSnb{MxmHD)qLX5)3!!K9#lU|ggi}8s3Gmv7X&}ZD- 
zG#kC2M{2*a5CN2eaC8G^a!E%rcYQPN(yt{yfB0M|?xyfh-B?`-AQPplrZhW}t7k|= zd0F-4_`hN5@n>ay`t)c5x{9sHcB)hi_$ne6kuTs;^Vg;jgNq@5-_N)BiNJqm}d zRB}cM2mIEa5c!>_-Wp?B%+3qe6Xrop?MsMOLtooo?+Jl$DSF1-GNqo1$UyPpljPEc z^6l=t-%7n{+~xo+d9H<~>tpg8*|Az!RFaS+JuwPOI@Z4mpCJ|6$RsnYYM7%9P@b5# z7tBB0$WCbRbe%1naGLKD_TV_(n>%{C7mH$xO>p=M$pcXJc1mzCKI8@noKk1r`5UJl=X(`INlWe+Hj7KYS&JM?sCc^80Ql)~wR{%x3rDoHb-B zszc%xjdT>ww6yMs6vxnpzu!g#o8uHw+3xvI4c?+XVo-HW3?6P&@9Ko_Ijy?fSK`f| zt&AwnaUX452#R5#kUi;NpV|2iW*OUY-)>KRst3icT5cs69VF`Ki0L;OFk{-(w8)K` zG{|j`60dw0TI~2*#>M4(tnw@}Mk4z|Wzda%KaQ$tGtmk?eTo#M?DRzMM>K5!HaZpw zpw)B^w0T4azTHqK12oNuhilT1skjE9xpR$*v5Be&CknpkyIA5S90XhyR-9d1@qruY z=rYYNdxk`Q_aXp=&PG$*Qm(0Bd13a1?mAL(RirT>023am{~lnU+A%IeXSkDI6O(+% zS@$3J*hNUx^YNeRMNJGnx4aeKu;{f7E|#3T`;AkU)-0>%vh@7QbQ5s$u0>w-vLw=~ z8bve~9(f)XDB%+jZyz@T9&N>bfG-h5di%Ct)7W^u-ZL5NbPw0(sC!gyb3UK(8q$uT zm9PP(J-r;DdQ@iB@hayN8hl0ENO|Oy+x^3-gmAVGU4+Z@)?2Swj+43N0MmRIjW>Ep ziDAeaYx;71UM8UQB@ONamf%NAMNy0*7mXCn$mrFH=v6x49l9J9yS#)QJkoAto+l%b zk*Lwxek?*`Yp97HtJoV%0EJ@Q5lJH;A32c6KI{8Sjs4D=jNTK=;nU$tw?&2oWyII7 z&-uCf0so>>2fO;v6I+e!w%yy1fHu30(pq9LlTQ_!I_w22^@u6q$ACDM2b}8|;uqqsc znW@Q~P=0gsGLD4vm$&o!z)&ITsDb=MKT_BaSfY;~6Ld>y{? 
zy@vJit7rBG0ye1EALUF$WW49D3_915LT7DcNgnfpU!?9E=Ry2hvl2Fo;$JdF-Ycy2 zucZNvAFhF=;99qHd+&mVq)eb9qeQEiu}}HVPSJGtW%dI=%4X8}{E|WH>SA*4M6w$* zIdrQlxzRMF=o?KB+YLr~R%S*!LO|eRQe$(BlvGKJiHif_lvv^9K{=QQBi9S2t2?)6st1ATs*UU6WY+G<`8b;?^J0uUFKtKC{cy~yM8MDsJH7mdK`ulFwi92 za%Kxs3R#|*jmtO9zZL+?KoI?B&z}l%8HCmJkLJven{M9BAb`kt;LmDGB7}vCv0qEaOj(?lqRFt=HvApjhoJvWzZ41XUsd^ z$%UMa3CXO?B&;kOdkA=HRLYWJPgF( z)sHpL7fjV98CKcs7hOVn^>w!^FvfF!!c7ux4gW}NUCSA#Z}#nk@w0`w_F0sRKxEjRzM+WY#X8<#kb zSQn0aw)GxxLE#e@x{L+)$bTvNGT;h`Qmd)M8JQqJ$7C~ZrBL|#wUSB-2cS((%Z(U) zXZSY1wNYq(EkWUI)u0Wz*yl!ZE>K%Y61f+aK zL+-4Ab^Oz}OQs2E?E|_x&>a}0ASU8Rs8ztfVRS`ms}yXv9lvyq*!X z3d|F(n-GCqDS{5JsHb^uz6gHUNC8>}1bd@hk|;c);{J}s2J@NLwu=KY4F&+1r!g*9 z8fqO*C=K}Cc>vam&E-yXpUqq>!O}8y!F>`~DDMSeq*TUt+|7ipCive0CL6i%<-EC@ zwS%_qxG|ZqTZj~Z5_Xi&Wn1-4gT5)8oud%|((n@@ItCGgBka)U35`Z;I(j6J(hEgzUsl}P zhWR7FpP$m=&fX;_%NMZwdTXQn%kRR%F5hh;%O&4efAfz2NGPsIlH8C|chTaVsaLfi zy;uCZXAD&vt^ARzlZv(gb?m)mWqj2;UB@rp52zJ~1c1{nghx6`=B{};;Re;s|-^2mjVh;Dn<7Js2sKk=@K6LS+xNf%gnyf`;&lczU`FRFw{;%l@OU7SB48>& zL&PJkJ6`l9yc$ZP7iU&59oN=f8?889Y|V`cm)>o3jkIMK%p{h){TCGw5b}nT9J$e?9b)iS^on5 zH$^M-D%jT0gUlxnN_%%>9N;%o9I_%h7wtl=AiMp5YtS%xytR!qSls%nrZI|Af)5W`srk#JKE|b2?6+aNF~2}ff6?yWB{EIGFKBZ9C%33|oCV?F{*Ja`4| zTq)7^E{|ps8>FVhQUx6WgTamdtv@Db|D`~cJHjw7TW9vhi?x2Rp@vDO?!Fr;7W?$R zw69I2YJNj67Gs+EakAEVuAcU*nSRS#Bd;uP69iOdXr~|a^y|wM5mMmdg@Sgyyas%? z*;6hKl!mc*>53inL-7%J>YbDWZe7_U-{$8N_Y>BmW;@P>usgSgiitv#(zP%t32g~Y z+m{aJ!(2}`@?~1RwRhFSBV3&Ti1f|XiuZsOI2I3?(DZ?$ z8t|1dP_}@X3HrTZaxMxfn(1d?->mFP?(4&OXa0*%8hAMq${Gha)25~26?m{v=Z3?T zX%;57$lg(4>;8V)UIIfaiccKO5&W~$gY)!8eC?S-XZ+6Kyt#niTQX7T(abKNQ`ob= z&G&Qb)b7{r;I{ip74V-uG6H9McpSPwDY#V9n#nxlJ-sHvR)BV?mNtN?<>BIzX!mIF zy++1mBj^pGZK!0v@;$MswVT0notU^h?x&q-buzj&a?BtnV-u<+q+BR}I8u6a?-h_l zEsU$brYWA<=J@UXT_z$zd1uKCJWMX(*X+PQ69sCCm)=_G9!PtJ@e42(08?7BXirL# zOth}c8vx@7z&M8Hrui?CuHj|D^QwJra^;lwXYkJ$7}n`6*9Oahy9=P2p1vAjwkc8! 
z25DVakKJCaqli(?7EiT=dUPIn5;S5X?tFa}ANRB9H~#9z@d#w%fHZ94Gi-Cc1)(4& zE}1mnQzo*A8gO2-0>7$5@2mNfU-WCaSp%MpMXUxsOy_5rtjAcHN&^CH7&kK+wgT7ykO7W6!!cWgVbcvp3)rbJI zj%L=~-!B+}8JR~zD6|1Q7*dr&_T>qUY8C9oq*pvIUggFghKJS*&BIQ#%H+><>-F<^ zA`q@gditpsOTbK+jBC|Gj6JM7mW*@K<~ob?|-i>jXn)H z7T@`;Vlz<+=$4iKFCZv~|pUd_ri2lXfX-an-0Kc(flz#^!5mm?{nu0O=XXDgZRNi;FMt6bE>ETo?0oUJ`-Xu;Ff@w;S66-{mM zvHRaFtWTP^`|j$bd_edygo!VB;v@|4U|^= z4ESXU9W?plDueRyoPJRBL2Hw&7TvQ*FU zFue%C8Eg0U0Gtu9!w-zRYZhYlQRxy{z($Yr>oc2$o6DEazUPuRlSRsZ|M1j|spU?J zwEd#92L}!KhtuF*3lgPX(fFoq6TsVj(>|A(_U?&SSSE15;Lxvto9bM}wtj`1bkly}TNl^jIxs%4ssiUor=uKgN^m zqNoyFXfniGYWdt+k(-fCh3g*$q#C^Ly$^aH1VctSA~y!js#>A!L`gEkFZs*vMypd~ z(V^ec~#w}J{UV{10{Qh^cATGLu^ieSYX!~FRCZezUc^`;H zHa6`5SL?sTmvW^R6=mBgd%m*gz9>*7QK}F3#?JUxzC{oNYYWU4D`av3LN06};22)c z2b_%tp~u&&@m1ER+js$wV?HRy`|S#?e97{(oc!`$z)BL^(+IXB?fWOk#P^PG9jbv0 z{)adzqWzfMRsRKM7pMRw;Keg)X$#_?&~g0LZ$Kwr5}$Z}uHu>S`v%ylk@8tS`07#a zn{DpMckfmkH*=M5uTE329iZQtpXuv|hu&W=OL(q;zG!EJ?|g4x;ypT=V*3YYOSsXJ zs=S50Y0`bF>}t=6UTGMw*X5^6I%Nii0L7e~TErp7--sf82mTpLmwG=+I6UgKFniBu z(}>8z*CSyztL%WMKF!Sc&?miup`+gzlCSb+HI;z)r945vILXM5Ync<=&dPijnj`K) zY1MJ<`;5llQ#v$~c`n426(>i4=qYU*SUhBPz%;D_l zd}SD=%6p@@91UbU0Ox%{?Y?l5angdgb9h#v-0GzhLOAd_YI;Uk4e3Yoi4~MwA3e&H z_qYbK3;pf%{i4X_Sm@pJrB}CCJKsOs^~DWFh=d!HsO#$Xtpc00B7kx$!zs7pXMu9T z_{$a+@fX@k(RP3p_<@WyNNMiJAx+18fccLS(6>R$z_tdl;SA_kEhM+!J2rN}fADkL z@B8U$&*ofq&IrCqZ5ydJk{^opOAooi{b$KPS&jEJ`Sw%nRb)aK#)CK4yU#E&4PB_Y z{l;=XqJ-_fPVIYRd&v5ekWmRFXN?iDU7$s8E{?~xCfj7KsO9K| zfB$~pyS7pD{-8?qf8Zh@6&!VM7HS|Z|1Aa(KrE4|;Vj$(7b_HSWDmu}v{`BiXt>H$>Fvs{%SXU!_Ko_ul^*OH}k%G=e0{6lrcyHv1R*g zUby^4JAO;k+nn)V4nvjhJ4TmxThe=Qffd~$6=PBonBC*0*7=kQv@)Q9yaQf|=jgMn zNjYh0x=&^`hGjUbjO0#;<~kPY(M&eNaD)ftZl9e>3`k-~pL5?Fy~3oFWlcfBKCElRUeGBKgH3OZDYjEXXv z>zgMcvHe@^Fm!M?Xp%|?$*bGK&P0)sHA;&GO1ex%1nd(Rn7e)*8fmB%6g(u<7hc2(_INl@u|FD z59i>|vg|`-^YtJ7r($AsYyU+~~YPit6w_ynZ>v{7!zv zw#xeA;CH%&m+d!86E9!JCDfR+lkHrs)|cXsAClVIuQo;s2#Md-A>M~PR5Gqjkw5eH z%K84mE+_sIQ)uL4(dToiIEV185aSey_+w58jq*E|7AM`y%hOuNpmWQY6|K`A-R=)1 
z+?U%dLv$s*{y^WHp}Ftn=YKdqe_<}Q5%>TcsP_O6`Xj%D1VjC{KqW(HytfxQmnV~4 zvO(0RR)K_AHcUo68}3zsBgFFmeV9qwNUnLVkm-JJ zG1{`KbBYA52Ap>%=qm&C^zyFD_a_JvB?x!n z=Kem1__F;hTTRxiy?Dm4OaKBsy5Q}&G1|Hxuph_P2~~T>~4!!O6}+IB|N^Nyf@c)e!Apa0%V-<-z6nn zjlemW^1G`B9mkHY1#417;cWm zRvbPI|1JCGgv&QUQb0flGX+OrJg$D}jh1`MVXwZ&JD1$krOz5Tmw!G84x(pk*2AzF z6Z8OHTD||C^lQ`nQRv;J+M=3;;9km*Kyx|DB{?^@VhLmN2$5DWv{%-l3%lL;dkktd zedfd<$#V12fl-bxV|@He*|dVMmx37Pm%XJNbG4=Ey+1tQE!E}-E#{->l? z0V)%Id#|Mx9Zat%0zOqr@0AEypYJU;8-^sd-(B%8@0L$S5z=v-F9m*wVq>SRuUGDz z!Ldb%84z||Q5t74WO!&I_o>0x{F>gbOEOya+UaAgdf~Ae`xTZBC`*_|f{ofi90vAU zN`|Pz>BflndF{*~e(;t0I_dCj@)JGn*k3`XhLg9o#6M)M1M_VI^9?%oUU{@WESRx0 zN3KemW+2F5N)z?>`~x-eX(xp3n(NJh8r?p+R5Ajxmd*wacdg13A!eM##^njw-E6+v zwKFdgf6{sIooMD-)mf0{zN!1U?^ZeA=wiJJJ_3;@nJ%VvxThCBFV-=<{X0?gA(l>D zr?aaoc<=9v+E*!%-`i7V#pyFfqT)c%Ek?SpvE#UUjsR960q8>2bAPC34*O=AMBClf(0z!uf&t5)@-)V}Ms zp7%9fU=5~|BHSD19lNYR#+A%?AWF5}9BI{xMtkvc1IC-Z#j+*-sWM~Z;r%R+HdA(S z5tjwl{sd+{6^Nb!8h8|x0a9*Zwf9rTFG?3>(Tx6E1@8t8jCx!{hq}BjSHu-A#y_*r z&~$dn@=8gahw@3Ai`d!z%FF2U?i)w;_b==x>>V9nFINQ_w}EqDM?GFHowLrnyj@rx{^vb@BdZ2WskK&DkTjjO(49PdkYWo*+GYI>Gb!50;91yGjHn5^+rYa8(i zs=JVraW#kjgoo}P1I2vEy&P#6&lmm>MJ1>ZZp7lC(ZGa`(B`+@PR=^VdKX>j2T`~x zrPJu2|ak zH)LAfa8re%_acbUGtD#t{oai@*_gO;-fvM#QpUT`*6$jTjB8EJ96VBG$NtZ7+^E!Y z?>L-Z5MG@l7Flc@1`*E`;ySq$ES?Kjm3eAa~Fx!x3o>KTbq_x^jCgq&~T zb+}0!8gtX%@K!3`lFvA#tz(5f%|r;>I31v>IL5fHDk=2zL{1iA?85Bpu^1Q_5c1D- z0ZDU|{q7vMXIX3>ORf=%sTPvWB=Z;)T^J5new@J47zJh`Fty@b#&?=P)AA8KwyEX1u1>W4AyvD9K}^xcZTFODJUBc1qQ9D_9Zho2eX zI^Y!v$%%bK@<2V1ZCtKk*U+2cpvnbGe2HyqGW2_@F%>cY7VudwJLB~&1PBLLkeTr;VohHNzILw!# z`DZz>rBVeq)Z*}NhJ4~y)-pOk!CTL8 zr8eo)c&lj9s-M;lVM~er*q;jAO?Dhd*8T!`_1XQUZVZM1?ogVR_tNpMci%3VD=~_( zB!Q7jwDP(;p^x475)ad2=EgtM;8jkUxUcOBNzcjj@Lo4;IjtMUS-KBl*-@X`R8xPM zwlj4flT@tn=b|1Tz8FKcob=nIC`nZDIVgWQStZX%L5h-&v6y1^$32D=@Qht>{?n_ zPnZPU%m~q^q9u(BFO*I^d z+KuIp{XAm{vn-eag_5vcucy5*P?~qCXjHw`txP`0hIl3$ZQjG@C) zU2*t9I{rc1lUF2gQ#y@?M6yfZUoLVQ%jIz**&*SL&Cx`^x3vPCjo-^8qpgO7(j0FU 
zg3-3N6}ndV^Fc!xG}uJy3tQvwM;Ko3HR|71^|2KyV;}n1v^t@MFM045=4-81Da6uMOOTK> zFQ-=cCtdZTE&tP1+@Y`L&ndF3zBk#g-7+*er+pRw>`r({C7-Beu0FB{(|j^dl>1UC zOvHvS!60_c*0ueZ95oX8fEgO>X%Z$xh5`fu?54H~T5D1=ow~8463h)T+BJK=_M<^w zL;3f%Mn7}q?{A*V-JU$hbOl2?oO#E@id1))xP!F7;>1eGxU@Yu)g}#_(;D&!Mo;>$i5GAGmj$RKnI)Usdv7``V7%>nZbFCloAGhVb^0~!B#cAyS7Y(Q zve~Z`lNrhBTf+-c6&YjqmXkSO?Tb$XWFjkBYd#c&~|u&}Gq36{8k}>(^qsVIBz!y|k z7_rDkTGr5_m)b(x34l-?gM*($oEvmRV7Vu&7FPmIt-{3>aYhL>mOytlBd5yN*EaLg zwnqtJx2}2ry{It%{9&Yf-`-|nvDP+XpWUYWSijMDOsQip|K5VRw*68wC-<-~k2LVw z5)=5Hcar?Ql1|%!ZQ_zx_#$YLTkUM{S&Bw-=I6P6%e$c`7%R`X=o5ck;H_+@h zZ0R13y7F-H)zd|7gce>QxNw(S)DUKm^{l0}kIn3QHB)Lb{n73a|HzLxcA%fQE7%7p zO*=Zq%rcNUNJQ0Tv8nd!WO{Vs0Q-tG0hY)|X`Ka-38kP_`J>SBTr5wOcse2Mx`)+J zm9R6Aybu`}cp;@|SELn$@+FyxPWdgH48VQ;v7T~L?Tbym&&v&vEKa@MX>RY>+U#+Cm01|oF#D;ULz$zGVrFnkSmQ#hGhyT_&nSmN*RlV zpIvQTvF!paiSv_+xd-)tW*9!+A+(e8v5|~qzSQW7y0Qkpw6*K(!Y2bu%GSFL^zk1P zV^3lc?LT-ryy>=5qw9kp9jW#kX5P}-40$OK5FyUh%SZrxHzL`*_Zfp z`S7D(Dz@r#gFbx?*r=fZndj7P`TRnY)_I~|ka8II#kzc-z)Q|{5m1w$YZ9C2mZ+ zs;`N5@SZ)$wYdwOd@5cFYh#Uxf-#|AZGc*Q0}r+KA1uw0^}1O$$m)^oO`qMUOUa znv4vQ)6(KX?&=>{LxZEKx~Y7L^&7*9PJ7L|d;1eA>{YM(9-$Z{eZS_o&M{!mZ36WP z{9k<{y@Lb~$fa;enwlbfc=SZh&?(8Yd0HDw!SDa>YWF^Q>eyt#`FJU-M8I%vE|0xp zzY7z&_v5j=#>Ud^WtxcFsw(CuDhFD$+~xcjLg+y6^7_N(duZO!ksNr<{piA8!+pvC zVJ4nfBX_B{9T(hV7a?cO#f**QOp;Xa;H}TY*aHnBE1)J2wm;Yk*s0>}U`}gokUHoN z;+kMU@RQ*u)LOHK{Qhc3uqWzC86AcYtt3$3>x10Eh2Ra)88hq3a0!_omIDYf$q2XJ zR=v*Jy~@^-b&cSpMnWl>sVGHP$l+wn1R|L5$v8QTt*CVg*q+4{6vV3_IQXtMh8`3m z9)3fQ9gU$JDF(6XFsYxw5d*0|-~*9NT!p@!0*g{U%N4^oINzSq0;BM$1Psi@9;^4l zsNoNHvC&bFiBR$GqF7lsHDhwmZ$E`zAgr-Kc0i|T`6?|%&dbQCx5*G8>=9?rv`~y# zds775p3n$Y{l9fqq^MHn`98Y&ArquecQEk~jY4h1_1I#_0X>+SEZ?RNdM@?B0OSm} zL9PNCo-w#O_9^Jc)?C+_V=VXw2MzgZq;NW>|Hz-TP4G{8Ny=VakZmt%9bMO(ji7M9 zh51-Z_*|@EW>KwmN4Qdw55#+t2sH@Bpq@XV8zX8TCn&O*>omdsywjh^`l?_^%w=0_ zunW}da|2Z1DXzG4pU~9fd_LjYv9%UdyyJ4Y}JDTw;us!ih7<3 zLY2~E8&`QLn{c%38-pb#G3~kR(2y{1-CHeNy5qyXYG6;Z!>}+Xc 
z*UEW>mJ<~q4JrLgNPC3?lWf7qlmi391_KHFt)a1D{!TB$d;#MtAC6Zm2Sl1yqX-3*SArg)bU;Y-;5H}NV-(mX^~#F(;wzW z&n-@U7qm${Xh|CZYSQ@Q+s6RoNsp7_$vnK9J8{+DMOu|X|~u(f(IJBYg6m`G4@yyymtNUnKy9)|Cq zSJNSD+Fy-zAxCV4hOIO-q|pW%n(rsJQbRG;I^7?Jq=gH6ghh9~L3(|(Ew>t*HT`~& zJLR*65PLw90~6n2++et=w5objOzJ|q@en$j@u9NLcCUjz?2`DN45f;tH)i-q<9z^b zsPftPk@N6}i@fv4%~i3-pbgq#8|F44`2uY>GrF7rs!dUB5v*TY+q&Onm?{GgZDRue zvY!$}RxjZb@clGsPar7;aD0PKzBmx$;0J*jeyzGVY4H`dkZBT zu(=$>m2(_T#9dli4G1Svv@07pC99mU&2W-yk)mEdkKd89oy`I?oG$g|tg75$fUmk(l97d^0=H~9u6g0C2)0uJcq(S=-M@PJvZR0?J{aMQu#p1eUK*{zEYZ6T;Sg89sd3$ym|0KkvK*8fvVg( zi!P*Ol)BS*y2EsisEhGT{DX5WGxnX3z}LmbVzkfyeW!eh$+2>qt@Quqb_Avd$aGUJ zRrT|TEUSEqJZI?e3SU46h5zchjyr#>Z)Zdu$vmxFt{mGts_DG}QpMn^!T3;1R7e5y z4rh*6CCOqA)$4upwvo&yI0yFb{CmiZlyv=@)9vj=Vw&i4xhLQaotBzILqNqk)@5xN zy$L^my!@O>|CLeF!}10(qtrw|1nmYE&f52MOYl#(dpi`ZTGD*;OR|P_CUYaS|IAj| zExzJ8AFp?r8H_G{&Mx_Q%77-vtQL`Wb(+-cyOSx^RvCd1^GIzNNOG%eeq!}Qw@&t1 zmCVYiHOx;RVHL)r`jsf%FsB{D5Mj${RQjp-4Jkwuwz`drjPST*ukFAOH5ZVWQpf`ns zcq`k$IVS<|`5(llWKOsyKVoso_&z_uxk3V;*>llu0!@JPZBii2G+}0o9<9C!ViN9e zvDYvYCbTsR7EF~%Cv=HZ~=Yw2*gSK%HS*=#u z+8IHgSVx~QlHk7RWC5#(FDhm=Vm~_0RlwMEhCiLb^jB?9Z1+MltP9PC zPP8MA^A7A>X31m2@Wt7VhX}pzVbNr6Qt~DmeRgbRnvrWzOpRwJ4-O~2x$f=|R^P)e z92_+jlv?bkW0JG;HJ!vbrOk6T^D^-lC>)ode#Z-4U)Sfz*uV=$FrY+Gn$)=_XacIa#tdMA zm?n>eC%+XOBO|E%L|XTUY`aG-R>iCWSN^Bmnhtf6$pzR#6zd?{xiV{&D^HjNNnc79 zTNP_Z&1>T81CQ0D+qk<-h<^`zWQ1*jfD?6P6o!vR+u8L!{FwI|* zUg8Swb=vDe|FMQ>51Yao&?bFO1L7S2{B7BMIhcHlY?TKgdK15lO8GQocleYfX(x^e zmyP7d3S)k&1+TH9Uya?@;Oqe6mn%cS;*-$2;KF}0Y#p{{A4W?=0P)c%H{Q?3b-mk< z$z5nc4r^WSMZ%kgWtj_wLOmaI;smPzCT6IQ#M40=%aZHSmYjeI2dviD)9l&UdhlZj zraT&-Afa~a)}G9s^qC0BJiWU_pII+8TjveYay7j99CIlce1G~Aa_{oEsN;jmrYP^% zceolFN3%!k)qXohfCk19!_7@f#`HbJMY_z-?Jpo&daD)dXwI<-G3R&dXe~Ay+ggP# zeiq?SLu6x9zPw2+eEB?>>nnj1<}c#-M}`%0B_aO56)}v5yi0Abt>-c1w!-u=n^}9~ zQWHiSO-smBKPLXjo0jr5U7rtBPw8Xgzp!*nwZG@(O>De}7k;zkEN(adL@H+eU@Sg4 zVW+|`tey0F4xeWFOAm#uB62$7m3fSuPoROB;sWewUt4s88WcU<|c@6{wTD>K&B zBz#~GQrcSwj_o9}_lB}Ue$om-Ev0`rCbnAJV&Q-#c3*1Op+C0MM6R` 
z-iaf~A5WsE7csc7=)w8mATZQ!LiIY^t=@?g`vB9V(AS5X%2EbIK{}o9gP+ABCudb9 zlxQAy3Vt452T`qBC;5pgCz5i|6GW?gmSb`Nhj*=ket2sm?9SSRsyPoR6e#b0xnULd z!r+sj-#^rb`a#_tBEO}tv<^T_QW%M#U2Zx`zFzz60p3`0C&xO=N>#0{_ZZDS5C~iC zC8qaLW`)iyWytI-=~bVjh6l=|`m3t?>Dw1tMP1ki2Li3`7%}_fS`Y*o`$ZlD!rRNd zphc9AQ|)Vol|Vu0J$FdPTh9Iyj>@b}%IQa7vORo68YH=zg33tPmF@Kf*b~gsSPXh; zbK{c-ZaNxz9t_U{?o)m$}bPlo#565n7MHWV9CcWy~v^F0;m7 za;~(bn%c-HbHr4ErXLSG=M1L_$Lv#!2P|or4egOSsY+fzm^@`xq0+BFbH&ERb#4cVjm@~ZxGrmg93QD3_;w@1UYR$z)nWh3ex5TJ!%qI2 z!h#@wKeix=)FF$vjq!&13CZ7ma)hX??}ns4p3{RQ_F*9=eP9C7?+Q)}DcQ{G<&v}$ z9B{PI^i)H3S{L*a;jyGsZwtOqRshy9F#!vPh;4tyiu491lo;m|T@kNj>dK?zhZ_Vz zPTE2Hd#7b3o7^->8cCW^zqFKnbyYgikQ*9n@fB}_6G};h2}$g9ehuqIxuT1);@|4+ zdc#vntVn5$u^fh-3)5R3OhwqO?D}kev!U^bKye`>mtm#!#b6vMic4J2a(lZL;k%V^ zd^#8~QmCJ$g&&lbl*m0>RfVVuerLbVk2nE$Fi5Mf%r+v(*}g$ZU_GGb=da!kCBCaT zO}B#S2YtG^oD_5R4`D5xDGxJCAp3jgvdbiLzHaJ${8_eemw)%EXwb)7m)XD0@nMUu zJJ>C6|GNAtvO{1$yBrZ;F8LM+ntuD#JKGYU;{}`!WTb6j4Wax(xa%*M!~z!radXJc zfqF93JK6lODnG5`Dd0Ij5!&_srY_X{kK+wZi48E1~c8qNVR(4JoP zlsw~sQX*q59rb7))@J=mB%ZN@y@O`${)X08T>G!(w;AGLi$4-o9-d2UTYQ#ucXkMZ zvSIyLpJn;Ys%@Fep5Ag36^!Ol*KAw}l#!2X)@V4V-pBcIh=%-x7)^>%CV`G_#%$xA zVtZ&im=zp)DF_-#a(xtDZimNoi^jAN_a3@Bs5X;T=3?^Lg)iNL@Uu7V0CC&NLO;$< zv5{VNWUhAG18JTcN^M?Q25K|a#^56h>Y4$ef4SJ#?x=`Ml5tVxw@I9}Oi!!@BYlN8 zMDwdIz|0xU)T@cY{ym?vcCe0VR~QnqnHkDMf>dv%+QVn7B3B!tPQrw<--hXY)k9f__uHg!@qukfnQ_|6#>kY@!+`c2_9CAi z;_Q9C0S34~Tv<+tB@76x!XB@InFP6wl}Y(p^Y=C1rdMt7; zad$Kn+!#o(e>jbF?@6(atG22gO-l@WF_Rw5JNh<CYB{NlGt?jjUHy@FvBnGxrm=XHdmj+h7_Lx?=aLd{an>hQ2Fcd9H)|`U>uPWjHbuTd4sK(m`CO*hOJ%eZZ`>U zpKYHv%&-p7f{ei~Hsla;3?5KszOq^}!-|r}XIL+YtDIPfIQ<614E7co@akRhlgV52 z%ErbZ<0eEhDzM5$l&Z~xmcfqza>1C{A9V0Jsr?}s)DNX2IUuaWOu(rmk0DDBfpf&c zKUT?VUzNP5AwPljmm=T%d@~2v*Tw>et2gv73vpc%;v4X{VPH(L#qEPT(@8K|BnLy8 zc`?*6wI1}c#A+)%dOarDR+tIb&c;A`h)(iIYp;+yLRUkgp-sW+RzPC1FvvFPYlwzh5|Pa^*`0E%FRh z&~wZd6d)GrKC{(23l@aBr5q5-Ypj$)59NI|Zzg?U4djkXSbi8FpJlRV6H*D4uL2kw 
z>gm~|6EGm@jzKoY7%N<3!Yd*mj(px&XJdP=2C77^-Csvuj#l2@jk=gG=0N13HoJPFnP*|7iddcEI5i!*n=T)ze<8+1#byV+ZmcUXl0E!#Fvku|8)c3B9o0 z?P$&pfaE{&sq6InG9flC-?BXLtz#iB#|irR;VAVCLAj`ZsIr)?TSv>2{fW0lqh=o- z)BBJgi8P8V^OnXB&0oS7IXQKb`}-J=2P$@%9+iMlgwlZ>75nSjXMVrE+4f#9lt;!$ zJ5c%R=popRW(-XN98KpkZC)5io^1B;=2d4b7x^Bd-*jU&t8@!~de1gPl(zdqW+%~{ zMe?DBql~zvaP#-3P1;7c3rkXRdbVP+=l0Vj@oN_as}rG})}@PEp&B<9&N#z+{Xo~) z{i&O?w+So;f+tYhgm*?p;C@)D7==d15Fsg$FTLQ!Rjweq!6<%Q>(61iomWIcgeXj; z4xL2qF(}pg9GnjEM?yd-YlJg&^dhJ?mgbdGkNs0R^F14K3?Vv2LnMp?_BM24WfH1*2+Xj=eyMIs=jtRW^Wcf~%wyP)dTy#RH6I`Kp#iW<|~ z>DM>Tj(5cB-)NpwFjvIk2MA^+0EnF&r?rVV$L9EWZJAh`x~K@Hjd49CvP`%cgKEp z_-i0XDQIY6+qGkdE!fttIR#Dfz29}8?DB>c6zJE$JjRUCs)X<*jhz{-qpMR=zuj8> z-5Aa%;XXc{D91pBKzIO+k7YFGu&HZ}to1a8DoN%!yN|tY>*k}9QHvHX8f8GT`00qy z!kDfZ1;bT8CDfYet;i00OshiMOg$c4I09c@?bMke}DI9`z*) z0Z6Q7jT_(bSwv3e*Yw5QN{rCGr1;!yzCPlfK|&1o)GX{}p1UwGf*PhN_l5`7yszv* zknpwkJr;L%?YFB5F6WCm3!v=M)=!rK_d}*xW)z<=hzaNOn@?&q{ps(1ys9>cCGpS{ zk4*V>?)ZpfC)Pz5C2S4r=c~_h>5>1?rirbn#*uhoQ;zixM(c_0KixF*AO&nkVFE4G zf_k*jgU8j;;C#^Jk0(lAD~TCxl)eP;>CU(K6#m#`iBI^n4-UHc)I|Q51rSARDFoWd z+bAG&?OAa9pN4CpNa4Fi)H4tb(Cd%jTASp~1*{;kYHkoZ{Qy!e{C9n>BilakVLl}@ zO+MwYTkJ7-+ongPd)fTdL64-h%6I;@9a;!aS2ss3tI*hStcKGJu( z3oJLwkf}k zV)oQ!#5cxKZW6`)cw}yx_kl%4z zT*IkR>2$4&#(SPqA1zizhf>&Ahye~tHehmCRAFE78)GGSAJ6l4sOf_kil3zl215+$ zNy|X|SZ%7&UF_9s5t!R%%Q9<5jb{aP?7)~vrzbZ#J};MLYG*pZYHV4sBSqPwjfq9x zN+2m6Ys1MU@!;Q;^A2scUZ5U~E#C9Iv0?wauw`1W8@%b$bDNNCR$A1`RQ zoXk3~6BT7@o6O-Ulc&LpuewH@V0Pd;0OIX%e;_Y8?Gas=vuMCxQ~vs8`xBiIx`)X@ z;&ayZv(giCqn5-ZPgW%zVFZdQCLjBYD?yyh(Zl#3wmL~c&E_5fQp8vvW_NSfOP5d9 z5BTd#0-~>ET}|4=>c!8=qQtGmCpZ!T{MNXm^xJvZcu4@*vvjUe5 z3skJxG(WVH^Ct{TB<~-%{VPr~{R_Ls&2HGntje24>{zq8kF<~V$I6A(hkb@c3G6cM z5)~tA(K<^s{Nw7m@BbX>He`}TB59;G9{whGP=EX2rQHmVJw?A>e;I`Xe5o9hc|bu1 zA|&YQYIn}qsBw@4q-2n{9e}1G0#@K-fBiUbZ|l2TZ|lZv8QGh?+d{>B*6y@&fj}Nn zh|A{z%sBi9G5$TZzCART;y9=MShr&8_2t%H*U|U^cjrDlAH;6sJ+(L!9SUzRTb&)sF-+*}YVB!h$tb z)8gZevBbepS73;T*#gkn*L#7@Ds|ZOvO)oq1g4dN8JeBX#}S7hZ&qAHZD9S(V46g; 
z-Q#BIN+W*fO4p2;v%nVHIuDd?c|65@=P>e@SOU{rh;}%I@aBrreM}`;doC| zhs0a-p-Q#?p>3@enb;dWEa4dFd8}+dC8-2m6H@d>8x{Cqy~mH za!1^aU^lB@<%g~hc@)>vg?R#AquNE1f47k;;1H!Y1RO&h-vGF*ETXu)VY@kb->%95uFZ?T&vx2F=4zciAWly&U=NH)0K_$KJkCB9R*VOq<+ z_pE&35-mINJFcvIY1#f-S0kAMVTWFWOk$|rV+q2UcE zcf~Sv4%A7L=1+;2l=vw1v(G!`7ts%0I(3Xb2Nhr)CN@9e%(h>vlX>oC+_iAt^1W*K z_PVHJNH9yfv^@UR#1o?PzpSzNgp~XuvKz_1HM>`1Cveu45#YmH^?>vv^E(WW*pIVT z()flH!Pe^TtJUF-*mHYv{%)B5uprx4(a!lIuebWDhH0(tw`|dBP7WV-^Db90CnIt5 z@WdHh)Kbs`+V&09>=Vv2FtFo}SQAV8ux79IEn_m-3WipWVCUp|)+u(!Gu8GYU#_NO zZ=#KC_sW$xc^E^Ado0{?h&&OSoiAW^+xi?& zUh`Zl3nT^3mj&qKx|4($-k$6nwk^Qy=zbbn6fHzv|PwR z`Y<){KqN>x&s$Ftp4|I9X$#z^>p{E2T-1h;dmQn)snO|^`ffj&1GVD6F9!Jq;-A7D z)l^kH_!E9AiL%fCYBmvW9~!j4Xl7lr-xNRunX2KwXjf(VR}PmJvws_xBwhZTX8$WL z8E7BZA@dm-2v230M$4xfC`Y%}X+WM<|I#9Yfu1EMry}jSu_dn!U1;K0-u5(PB-xiH zCjXu((xYyi=Y4b7v;W4^t=K32QQf$`s2*ygT)4+;g_=6|)96TCg|$C~bZfxeBjKx! z&Cy+@4cT zpZ|xgv;K-IZnyZ%(2amJ2vX7w(jZ-mfOHJqgLLB%ib_a#Nr!ZINq2Xrba&jt`>wmz z{o(!vEY6(sjc4!u*|01`(2v8qZOVN0{74jC72luV%COYP;Pa#5s^ZMGfDXvi#y7^f zo$NG3B4646iy!?GQ5-YjHPRcm9V%+&B*Ezo9PA_WqxG$@t`EifRh*|q6Pm<2ug_fi!V0JiYxOkONE3g%V`b0^OFHP zV}7GuMWEfS^gS0B!A#0bS;eu>1(Kn{$ca`=G*^X=OZMtPS_IptsUp==dI3L)$gxvA zdDx=HyQfDrCi_QaNMzJN?FYcb5}mXk0DQDRQJ@}x%_#CHv_WIR1p=)h7u}CQa?+9c z7IJs*^sLDztCgfGk%;OI*-XfR+FEp~6bups)MP}D;R~~M6}nD^6HQ*58$Vx)b`t`S zq$i*Z&-7rf3W-g3JV$wRNa#pi3?!XxwsR9XaZ(6$JNVpP_I@QY$OCNvw%+1_+2IA| z(yCnLB>HC&9L+sK7o^kxgmE_k30xsE&;*q*-791Qaw7+-85Lz1g-UbJEP%S3TOZXU zrR-Z+sSoBAfuZ+Lq@N3ZcFo)1)FuI^tvC8kUE#8?d@2XJahK0B>Ik<{Dy%0XTS<9o z0r%eG*vV7_3p$3J{AS;42?-`k85N}iOnF6o`47VngTFG7P!!EA^ZtgC<(iJ|>+f$B z=a-P1DQOAhO(IxdWdaNApG&wI^s1N0Fk(Zyq6n7O-Cv>8MJvRt>#JXD(CqC}t>-97 zR=1wekgaN2`X{+eDb~AG#cxrAtD(xg<^ghiU1Y4ef<(!&yGoj*HaAyW3}39o7Z(y@ z(F35tf1jKcVB-(>w3!+dFTQ+13lX9Boz?b8yW93<6Tov8{$|)0&ee-u^7@gncene` zEVM@Z!)*5~eda??kT?q!LpKeOpX&%R>$yHTz4AkF^v#sv z^0@wt551Wtb_}B&K<;1t#C$+N2>gM9hG+%!9?o$iieI-^MS) z>nxSL`aqH3ud*S;)LuZeQ$-mF=~#Cl;0T8=BS+dsj@)*?`l2q_2=4{dv~a2li{V%# 
z3_{c&kGB_l3_y5XqxBImUDGl#DOe#Qh(|5JM$#ozfn-vyewL3Mb*n4GY|1E*eA@() zWHGSC`uq*A9pcR2A$-R`U?)SIFLLd-PnZ~U;z7#>UX^Z#X5fEF>yVbua_0|aZSw1&XS*&tyfHuaoL!8 zB)+jaWD+Pp3y);Ca#9mQ@(@RmJBnsfL5l8h3TjJSN`XuTD<)!t}mg!l$mn1ncS2PV`BCusV&6(E_PzTC365 z@<8$mX~lm?RF}!nDKrv-+7tDH!^weQ=HQ5Zvfh=~srH`<7ZsG3KcC3E6&qie6 zi~a4X?*%67p+I4y>-eE2iRbgX!62f)mgcFU8s}~JZPB4=H=L}Y=f~nat*qn55qqjr z6R3m4A?R2CpO3G4i+bSEVGF*ErSjCvkj9d(h8;x)rTDf3kUR>-&-v`;@?2X-wobRts z3WPI(P?<7k*TciZ%NgY-rpQi_i)h3&m)POb`p{|mZ(huN8h(o$@9fSfa7xbt9N-qP znNlNbnB_&SSqwikdzW^it15{7Xso=C-&LR*Wxx|unpDRQ<+pJIuC!YJ!2PW-(ik9j z;OMyDAK7eH7-M&?r5(ctV|-zf8i%lM7_px@N(X7cUw_}bpeYumyu2+VZ8K5A7_mJO zB}2p)OJ9^EzyCQ)5tz}lo=vK#-4Hz&^`zqa`JPn`{2ew^Sh+Iqv9vOzk# zpo+&mE_@uio~e8?+D)yD2}=Xr$MJ@uz$umA(bXSW~b z;ix2Gks&*E3=?B%L+U4`mKCVH;Aio%jEo840$~#P84!VvAr}`HhbzT%w5bY!mb^eC zJ(#!?R?E#z#>yRs-;Bc4(eP*~1f5m(l`?&=|BdIqOy~8vb>4IYS-_oU0No~nweG0- zj`sEIcbrY&LC#j|R(mlqF{C_ppt2FX0L0=PFJ~>yA1_RgV_@=X}2u&oZS6b1VBSZ(jt_{n4qVD@L2Y|Hb3u{Zrk; zC~Knj0$Z`K5`)nPCmuZ zf3O7L&IA4;v_5-(7j}c2Xfq%7 zKQBn`F6DvPgo3Ma#ssC9fj@hFkGj{(nZ@R&!0j?OBC^d*}t;O@j z9NbiQlp1d!#6%>A2bF;V*FVJrM ziLTL4MNH{Q`lD6$`pJ7{5wzZuWa@weKvLM7&Lj1lN2L|q$R6w}E~gE@Fq-o_awKC= zg_SY?;G<4>mrm$&bZY4^EXv$=vFi~sn-Y`4+Hy2Q4aB6z>RfEl`p?$$H%v)*ed?3n zx+u_#KGse+Ll=@xT*d&o;Obv45Y;BJ@R;QU*4fD4l0i&-R{(j%#tL4{Q@_hn6VBk4 z7%P*uF9?j~#t<^Ukml0f1FvD~xnn2mykjYuW{&+Lef~0Q#JSxUxv4hENS9sHSsRa_ zZ*t<3sA!h%v4~Z*a!kt$pZs;UVEMwgZS(KC9NbzD{|vC)ICODo9}jt3P36=raOc!a z2o}2f38A~9$un;^-zP|=QZPQyW5y<8IbP5!TE}FpZ{8b}$X78Q$pg_rI@Y1`vq67$ zb1*CSW&%>FQj8g-m3Q2g+eLfp99=b>B)yQM#S6kA_!-)gl|{=On@~dx?V7dlk})J$ zzBD||+AZk6a(f?^Wv?8)ab!%J?phkSZ1CaUMyH~`bG(_~X_;hRg_u}xq1H&4{b0gs zmhXT_K1_g)^|mUGbUUK*|+cPwQrqkN% zAS8v&`8|aZfx|s|r<7bbpMGvQ@Fz|8XK7B38fkLB+{(Wbat=>qZ=wa$OPKLA1SIjr z91Dk+XEjfs(nauE_XKuRMzz90B*MOpkiSb5g6h`NT9vxl$pMRcLC@(&a3_FVs9_n+ zh-H*82X*mb+KaDRjpy_x^4buj`(6OGng61J>_%E!VvauTZ>hCyAQ1Vf2ID~Tec&0- z9jQ1OTzAjKMuT4o7C)B;T)87#-UXB4CI-e-eibxI8cq??1dq!_e{L+>UpxHR>fL&{ 
zvt!w*gz~-Hc}>BU=3^ZHKQ!I^jx-J{02dY_8#%PrtsjKlOJmP%t76(1e+j}n=eYqo;I^`v#^3|;?;{pM1G4SKReX(N zV(87FXXdK^)`C0PY2UU`qp9V{?bkRhLK?WoxJ*}t&S*t%0~ds^HS*^t1Z-%j=?%=o zZsq(X12&h+npoxE%cB5kC_#_LiMl@(yHR0GWJIgx5*XYoezWBYPYQRiQ1dn>Zd(Oz zvjhQT`zii5ZyxRa`c8V+W=!@`9Q6tC7Bsu7D);I0nP2~qKpa&aV`Nh8Z}K8 ze3Ion5H?G6*tNdN&wbxN*W152Q*8O~`nYPW)=te~yHjQC?(d|lu18~ng~D_kE*bl) z7U56WPfl`&zH7d9=ZXCaEh8X69w@4CRDMv;pbz`De#okE9hIxvQ( zG=b33sb&&e_G%Ul&GGkYI{jb<^+Cm{1x0MJmb8#unbF~6GBRJ^XETmFdWvlfV% z#v|`%WRFeqTC>uU8pYUnBLeuv6gK-#57&R`_U*r)DV6Ar2mHTV|K>X*Pe&^rfm7Fn z)=0sp!q*9%+fQehdogAe;8#p}sgE^)dkHUlwRp6p59;eFb3}J+x z4g;_U+svj9t&1k+W!PdinSY9Ov(1l|V)+xGz5izr#q^J5g|AweNg>tq$%ka5C<$|C zNgt_RuWF$7?GKMDeXCOUvcg;)PvI+?E4otw#N?k1nhQmZmAei#KE=u1Gg7MBOHt1W zLB;h*=8Nd9vg!eOE6@4j{%PU#sHRh2)2piIS-?5GyA{Sv_U~VLkEplhc}4j{u!V?z zjBKAhU?cmHEXXUiUdt(2Da*w3(b53~XKQECGRhI=?p7%wDeYC#HlKs#kmDa#;xM)& z3{_(m?2%J4?rcQ7>5A&gP_}XPLTDfZ1s_`p2jK zwsIx=AyGUuIrs;tbSyVJX#d1@6i_#oE-&eE82V8)H}z~$`>VjnfM9@y&m)mx|6l&4 z*eAW?R~KOt#@&2J=Wtf1wUp^j*r(ZNoxOkP>g8{9vVFpjo{K2ASFvI(?p+KeH%Lv7 z%oa`8%ig8B_9|KdUp~P>^@$-#L-ZatzWbXT1D%FQu`>0c$&czg(pl+5Dgp+45un zp`ppXtaV_c%ftMmt7+}Jw^eNeuCf4VG|u%=YHBI5_>`nf0rm~bhfRlq`3BiP=UqrX zob#5FRdcVL=cf5G?Xz&1@GpB;j1?nO(Ytq^4LrezLs&VQ!A|;|hc@Cnz`^|!rzT@b zo;a~f+Z?iYE*5C4r1`eMd3Qn{ocNqIAYEvAdTfB4fb;C@HVLzvlwZr=Hu~JTzN9t7 z^jrbRw_3$?A1sB;aFH_We*do*ARK|}2)KBn`vVv~W#AplTM`3J4q@`W#dIJ(tQVd` zWlkMNL${={ls=9yO)f^B3K4p%z+Agoj4cKbTr>^k0a}s6W;KV){{K6OTR$t&D2e@H z=*w7wLrSYzX=_y=+#R5Lbr=&rZOAT;=qN11AnjBL;bt(~?D_W+0F3xFB2Hk$9yja; ze{sR-sg5b7benQEpZXcb=2(&{$DR7PTw!I*Ur+&Av`MlN5F_6!HG8jDUwPTKIRhxL z`eE$FM9zcA4QH)~0XSC?Vq3xoNO6BJ=qqczl9PE4OoY~J_lhfqk2!G&y1v7k^htFe zeFpRmCiR-=?vGlYV$tyF-+caC?N14+^}qP8!NMX<{FC8*^5nv;_ zRJQvP=AMJAH!#hMyzjsf?bx640SP~@2EE$o zXKzslb7o0|eA{B>0>R5cEmhVJ?3$S6D5dGMF;c4C4TtspWWQj~=hB()q>+gkqviI-IYSe`abMCiStMLf`>A;6VDotIjW65tE zVA6E2(?w7bTA|Czy)?^Fu|=uJ@u z-e9X>s3o;f6anQQlSIoEJj@NV8@{R?9(_m=P+J2M)vrCTV!yWqbPflyD9Hn3r=A#qlmtYqHq{k= zVjj$@-7YytsdIr+FYNYkKN_hs0skZEG 
zR1hR4M*IdH$bS%~cCnc#{gXO@&p>Pb7ZvSjO4&M$!bZEUnBZ@NUy8PF4eZkoHIWM# zla+`$+^gF+3A{Qi%+U4nD&IupMPO-$eu16*DBDFhg;+(G$4OtVUO~6$%q6_!d3*3V z_<5t@Wml>z6-mezy@NxqptETGoG4DK_m5b%kLb}9t7wCz)myuo1fRiE2D0Dc+$>#L zmSL0f$YE0ozp-z=$r&?c#nxJx!4bPx0p_wf@BzSst9Z23wD~)TQc$u0f}2tV6h#!P z>nS7aEA;_vEfkPw+mD++-ehR z>jKD!8^0f@@fex)_RGyV)q@BNR7gPdr&zxbZsiSsS~L@32XGj`;5cVy!*VZC{#YX! zC-{z1j7kXPF41#-R==>hR>aC_=b(9E?k>^yk(G4(^5=(UO!lDpruhjwM%IEf_TZcW&D%vi z*BK_Vxhqm#lIXRJ2vH*)D17y!#zX)GUko~0) zFq)|k$lyDc&o{b#Ay0J~(d+c{`mIFn$T12r57kHeHL07;l9r9SyQ8=uSiL+%Y4-=w zbb4S6^XA4v-KB8BTiKTMCa@KZ+R}}57A;R z#0Zor1Q`Y__QWGj{jvHnjHjlmuCmvyoMig>7iBq6RaX^^P)f$?Tc{MUNF8UOWB$XU zx2jDjj<$2(lq#RG&APCta<{C0@nuuv+LVCZuKxaPVD-xf2mlNs#hHQDOzZu0oW&*E z=i_J?-_wW-7+%~|+dm9Wy;d*avs5)7`09A0K9j=4jYB}EX8hs4Z@*kil@rQ?HYTTv z7A|*Mno*rsAs#5v2z=_0i>P!5fE2sy-TF>z1XK@C12qm|YP{4vhyK)4=~Hn zlUwdj`_J0+;ilpgeF%Ykq@ZJfOveCl5NZm7xZV;%@d1bB{KxUwQ;;$^NG>r4ztVvm z%C5V&J(!eRX#pM&9{*1Ff}*|_v>k@&5Sf3nIBSu1aLP-!0-8}K1M;@4<9{4YL6%}G z9Knw8E2kj=*_-cyNkhbIh?lkS*2K~E^&hoyV86CN8g7pk?*$jKreOP;I@R8`@H`^i7+=t_w zKP+b>)B=3A@uq+;WD*%GlId!x5Oj2&B1CujFPDx%K-Wus2%iT=NryWV@_vS4vB1Q~5CFTaw_LxI0W2T-^HomFc^d=E0DP>!x1g3TND1GNmLvT*4IXVZpae6cc77x8ym}Jk}&&@M8z6OR}ssqmUzD?a0 zr8UrLu9vv{q(XwIA@dENn0He?I^5X>EWW&&i<23)D-{~<_qr3KVw}yT1AtIS?sxu^ z3yvzJ_sb2^^w5*9zB6`}BIzPfCE^V%`T2cjlN1!_W-|bNV#=Sp0bs{?pW5|Y%|Xqe zP*4QCtH%xW8Dic(C(|DAl}4421C5agN-T*_7~W{sA6srW&*wspHt`xL0_h|&j0V&~ zM%?&esP0E(v&b3TYCg2_;@UOzY%&a8;?R@Fhr7bRR`9(Z%VPlyB|BQ5OeK&_WG)!F z!&J&MATZ3bc}x?pViYIy1tL{1S*su@k5isp6oCrG6lo!dPJ=)T2Q@euBuX6)N=n+W zw{`lqseA|0y*vM>#$tKst`F539|CY<`K|AOQNYU{VPNi@z-EGLDE{g#uQIAb)EzL127F`De^U-JbY8}mUyRGUQlhwmOuk4 zXdpirDj^31Z(@(?(rSwld7vqm`?G3DG@$KKk|_`p=BN7>>q#aTw&rYB9ALdMj!T++ zx#dec(s5#IWkDpKpzT^s%c?`m@OWp~E3X)}g27Jkl*miR%A`>KA#EszgXAp-pdjiA zfg5J&EUGpn94>TDeA+Xl9UVtslkHcxCy1(kBasJJa3PuZwX+s0a9N_!W-3+<{K!lK z$P)d@2@Qhx8Uc$s&9vn*SP927Ce~NTLW1q@-fb-Ymdw_W3N&;3#SdR>WRGV)rZmNt z#M9bB*U+_^G;wKQR`sYRH<7NNb2Zez01g)oyPS9Kn!?b4MUN!~;Y!)}&Mj)*Ul;Wm 
z1r3IdE_4!hN9!CjXS?SpIo81<)Rj@ni6^o46bQAopI6@@Ddc2j-e1X{y5X*m~YayM>h$0|}qwwU4f>h@~JE7nZ1dfW(w8vI9dw^1) zGn28Z^o58;D;+Nhd@tDd1>}fejPN+t1(4|ITMnY-XU`)pAp?XtG0IQ>7`i)rWjjcb zg+U_n%h0$eCG3?{asR@^`ST>M6Ql!EFwP#i#|yijmn&Zg&*9gIM9nu_WJrM>=qT>n zg0B(Y8Pf~CxzIT ze@a^X>mI5}(xn57c$4|M6D#<p^Z~=gw%M)&U*Uty69WWhT^q#}I&!=?N^} znp9NKxs21(X*(vaC~L$bO35flwfI`YR=52J6rLmM)kXnQqNm?44$eRS0Icc#y=MJc zZEnkTdQsZx1sI`M%(hpWkZ+fO1qJk7 z{$1@GZ%Gs6Im9o_j` zHdZ=%FnK9K`gqR(@__Ub0^lX5s4j6xs))ccB$vZNpx_{V8a0@cdVtR%lI#dQv_H8m zdF1XEGRvISwFRIQy0%*)Pat!A)RU|&Z)BEX&W zigc<@lMvLBOhj*YXgb5XYm2W5;WK3idKkOiaiANHS+Sc9e7fm(rGCw46ul>JvBgwG zl|1LfGlb*kDr4^8>0mBFx$2QuGqk4EBi;C4;crELG7F7F3%Y?IE}(_x5Zz!J%KI z-m9V$<$vQ1&G}l38hghKjb~xW_Z#?(j59uhtjbHt+_IuNjao44Wdl#wLl(KjKi3a# z2`%QKSvk4PO@~Bx7g2ljm1R+lQ_1V}h0OzhugJ@q`Wt?&f|-Jc@45l=x48GvG{W119?b9|V!+}Ppq zh3jcmWcxiOzUJIMYqubM^5lV|=9gvNTeF&uOE`lFuz;nW7wcDJxe6^Hlm%5Qp7T3p z?aE@j>=)Yly+`-h#bs`|S;c94CWZ$PNS*<6{%m%v7r8!`%WeM+?H)7YfTSRAp8y0r zlND_`2RzpR(`t+7NvO~Btt&ygYQQ2qU(S$9am|vY5!7I$g^=EHkF=4)3fL)zES?-G zxd==DfTh6kU=Jf3gh&VsB8M=f%1_0BM+eU$PG^R4{V_IW5ARGD?fV=H#RI(r5!GRN zgFw2#OL4!2aNhnl=l#r9=>k|g-)l5OS^*C^{gpNmVa_$)B{y6U(?d3ZB6LF$aFXja zTRe|KT44|r_~7}=^rXiZ-s0_uK92rK=Sc_ipl(EjJWWGG=Y|^?!0#@=D&4ojZ|3!% ziS8Bgr^S>&4Do%j0pIlPrHk7@!G<0=`G7D{p{@0@e8}|tTr`y!%J$+m;^y9RhBJ(< zhv30SK0%~cqpo<4yVc_sGmp^@8$A*Cm&-2G`n$JO!D>4D= z@Id*~?KH)Wk>6%2UcCkC&?}2H`2adaC8EHkf{0PQmY;QqwdSDr+^yvaj!JG_q9Yc< zP}5*RuKJ>>)f&h?@as{bNGEU?EsaM98s9fqvmn3a~_6`r7)x+9l5g3Z(55K zZ(-E!78Fk0MeEBB7{xOSkb*6i@&R-&+wn`V?UwLbnWWqx(17!k4>C|qkf`2g5?CklQ*S|lGNX?xh^ z{Ao*D6Bq?@C!Z1o5_W5rM;BNR!w3m{)wk7JtKS2STUdgnDnT?IWfB1c_KpPpP*6*?;vkULrt_o zR9|7pXmx=yKsl8NCNI#Bha8BZZ@QzT8~_|Oy1MyC^t@hwY?YUiTM%)AE_`uT5cj?z z$i3CsS8KnWCFctZfdC4^Tj`%j-?-EHFn~LHW)1_K9V!?}vSW{c-T|FX7YwoMYXlhi zXKm)<3rkI2%s04MP^fO(0Oa$x(Lv({3U2zgHSbod8S7|Sp;1H>Ta*x)4CJRaCX0xY z05Hp8-Bfg^haS$%BTL{3Hy3wU7w1my_^bPZO%tdwzIeK=snT&nY)V_ubx3t-t9cyA zcaR#ym{SXo^~PJp?>_v9^>gs%5K(<4*%3=cq~fSN>XF^3_RD zGZ8tR-9jDPlH+@8_2|E$l&n}?At$l)2{06>+rae+pvqePnnj7tK_PFed}pS}GPTOi 
zI5RlYbUtlpHc@GX=e8TIAnfJ4-sK1jKz5Wr+ZxOA_3^mtXWxV>g*1xU##i=Xkm|0^ z1Ex#5t&3?9bXSDa)Az4Otd(?>KymsujYJ;o0vLbK*xvO8H_|q;qvCeJ>1S|04>b5e z#r7@Tc2|AYt($Qn9s6*I>2}IDPKNa+@AI9H7tQ9_VB%s4_c=a5BH+a_e(VtbR)>D1 z1v&1iY_sf^z0MXU8Q$DU>v|bZIED!$TVea4Id>SG`=2%M@^dHKGIB$3-D84m*!Noj zPjxz2`}xy-bssut!KF~~yJ0EfWC?qlZyVl1`tP})L0_Y*x^h2JRQH!X$DrSNfo8F) zuOlmQILORdSS;YOz7;zN;--p6aDamG=5PS?4onPo0mp%PFEY@f1nG&d`X*Ndx~)r- zq{+Bm+fq+a@i`K)XfOsjfum=;FlZ8nw};cxmI>yNP_eklaEUHmoSiFmI3Qs(xosIy z&8@rHQVp|?Ab*tci!xzN6Nn)${nh{FP9ng;Z)`;qGSmWNMy+zp~Hd|NI+AfVK z;u*w~{P|T{shj^hKs1j~pL*$8y6pMMA!1UK)HeD`R`ju)ZR0}7wfbyH^ zJydN^_$b#sLT@4lqLlOO-A?>EB%+ZT4;u)=UUm$Y54*Jvc>M z#k!PCob($<@JG0MpSe*^4Y`dBH;5g?Isi501Wg(FcO1J^3LVx;vf@P)g3`Z&2ivDc zcMuLTg!|U)3F~#HlfKed`gd9gUGyz?Qwpt9X8_(Zi|g`*bhKbdhfETrdmjbnG5>2j&+C zk(nXo){dE18y3)$VW6%nrDKikIMHrHCdE!ams4QZEKWZs-a_xyJ&xrEl1>sO{$i2o zBq(O@TbFhvIK`p%%!Z!qoZ?8p9U(mB4o&5TO+d|;mZwNX+BH9vES>JK zrKJhT0m2UQaj}5wA7lSmly!j(N}#1IK#@^ z4bVa&S`&GLZG8QXuB2uAp{IgCyA{nVZxzo7lMd>(Fc z|FUgGzWMW!bpLXK8(EAr=jMLBDXG$l-Lcui6(yL;5ayk@9D zNge(anqGxmeJ0jF6sxo)E+FvQ1M%0u&Vp&ja~@r*q54*+^gRXIpCo29!iDN0L_XUd z2vv)%GGCT=(!Hkdp&;2lo7nUinU`_E;*$HPeZK!>ExgkG-9b6*D{pMAs{0{=ncbKb zXyW)IQD0*-f9GSagur+(GYx%jq{~VpGDdViYiE^xC2S=)*5S4SQyRf5aLpuoFIRzv z2IfmVj1R=`Z?#&j39T!GkeKZt>N4d4U*Ev-Kn$()j?}hi*_fG9Na7qA(!^>kkw!|R zY;db{c5PzFcbe@sNW#}lKEfh+PoZYX&{N1`c`NdUbR-f7De_x5JBGrHwhm!e!Tk4D zL<~>DZI;g$eCGPiQy)-za&RVa#IZ5%X39*)fC3=8Y=7bJA2n0Xph4htgz$w{Go?e!uj|G&3Km_1X;8+e0P1z8qL)vO}K z)qbf;(IAym64tQGRF|p7KxedOsZDEX(4^D#<>YI~;kaylwT*>9#k?An{|JBS6XJvs zJ9o9sy8d4w-GHvf5`kDMMkf-;?*TXWv=h-W`S^<)#0CKk25#3}$}PU+ZG~psEy02O zaLP70wq7AMl;s4IkM!xH>HHEF&ADv1YkAYkBq3sm(bO3CVqV)|l}|S~axxgsT)s&u9Eb~Lmmck&5nl4P4OvCy?+9@tk7ncArM&wC^owAW? 
z-SG(Nu=8ks_|@klfg|@JKd+;0;q4oR3UVaWm>bU_p}BuUe48ozIVutXJ4l!HY;7QkI#t%$#c{K-f$DXP2S@U>JFg2cK7p#Ccm2SX!8?T zAJ{dg#T*t8JoA0Lv2Y;*U2UG2FQENy)ZpN*@F;s#-{`U6?t9vfdA97aSHpNQG};@7 zLUxAWuoFVkm$3P0SY1d5vahdS-3a{t>I(};!eZli01@2X0+GfSwE?hTo1Ah}m2`35 z{5`AdIDbKx9427?@Qq#fn^&6e&vIssjS6FdR|Zb5cl!$o%mDSLi8REV0-4n}#98mAb$vr``?jiMmtQ6Mjo||g$&TyxABKie zXC)1h{86w};qy0NiNf92k&TCvKZewjo6O?k@R0D=G+h$ z^PDqDA!;PC<1=Frw|YYVp!Zl`40F1Y~Tk}1gQmo@hUP%(e2puXh|fCCNkM6pxrOluE^=#j?i3>_g?c@SiSu6rS*K7 zFh1K$h?x<{JbDoxHhFHA8F%4yFQ2UZ67?TGTLl{FvJox@FCQJs`?U+cp9!mW;H^_o)nViBIDF_aO^qG`vSq(s`;Fi)GMB zNtEB0yxgqkjA3z`d@LckFT2VsBnM_Itlw<;o@(s)ukB2o3wd4s`Khq-l})f-vslk+ zWD#dJCCP*Lz3Umr)%J~qs8~e})?`qG6q;s9yUh%TWJ6EQ10JQYccM-k#8glG8Y_pqJDA7`lmW0k*S>%?Gi=Rv&%-QUZRi znoO|Uz~)6C4QxqjCt0AN)LFudV%UcD%t_d9@h0E5?UgRntxc?}{N3|Q`y1lCtJml* zIxWGUd%wDS^un0|v$-hAsFjS}B8kaBHw_o>bjLwRDCK!;7%T`SCF7n_F>t$f0*v%Gi?eI2GWPaP< zIeJ0D0flP|iUJ1NTy&vwT-rVH(rRS(WbUy*XhLV+7vPxwSxB(;AL7;788;KXRGKF> zpj`(0PqW|-2@)skYJ?I!B}g`hI1r(sSx)4=xV<|1)fGxs=;Of|(wdA^%Wl~G=MnkD z{>52;c0wAYu~yoxrcS8dC1zc^Xhb$FRScO#X-$Q6cl9t-Zn@Zls~9t~4B~ux zG*gD~T*&llXJ=%rW+cm!>3PK#H+YjS!R*gX*l02#4*N^EIr-M(cS6WXEo6FkT9fFj zUW4mblQQF2nuq`dJYf+DE&JjVC3y|#@y0>dT zq*IJF#b}si@Z+CuttvfsF%SL7$}%=^_5M!{_24}L)PNsDR7Xc*?)#TViJeF~^ScI^ zuk75tIc@3ueVDvNVgvxn4$E;BV#6RlxpSv&C7mjjm`mo>mn*_38yEUHhN)ftHuBYD zt_aL$FT#njE-Tk_23@1{ge|{Ix%uTE%Jn3b?Jx?AKSJ`qBx#bqLp+MbeOqA{{H=ZH zDTbbel6NlsXT1IJUj)$NGoTje6Ijl-)n>NbtoRG+%Li#N@P>|Pg6rQUG~KPVA#9SS zi2Bt3L=9ow!C!8P#pn)z-dJp$2TiX^A+;REV z=&1X#PSpG_ac5G#)+0_`X%ws1T%?;i{GBZPiONnCILVrMS&oAq5FEpet}6ttIc#Nn zlkM+vS~QQ-g50(nJ5*HLh;EQ!x%>S-)yGo#&S3tVA3R^xas^5mKdjYekaS%r zitH_c7xBG-g>Bn%@5`;aTLTl{Iys0le*N^A@c*Ke(B)QzW*VUdl9cbZt*bm z{Zv>U#a39BcCf@%Ey&(SJ<3iByiU;bVP-HvXlDMmS_9$EVvz{?%gBtFE{ehABjp|^ zprjM1&7*j3eqm#=0zPpn^o?u?Vh3PX6pCfj{|Q7D^DNma-h>IC`UOd$M2GS53tgz*qG6m!ZM?L z*8cw3S!}2Q*&}>VBK7r0!Fj(pZ!^a;qm$c|IoVq5WKk28u6uXig=xqM{>RRn`AKUg zwv36zx__;DER`Hd-LDT0suda79S)6Qprv0s-|RoWJcetI%5I67bpNdTEMew$w?AiQ 
zt6Ba<^i6sj4BIDT`O!Px4^un*`Au&d4L!Niv_D24j{hk+ob;tyt4f9FEmj)Bd?ej! zm8Xk@sG*r%hIlh`_;P6yT| z07H)22y^Z_M~B7%b?#L-oQ=Aa2z73e{(uVn$0l zf!x&)1HJvH-F)f82(^@yY~dTSmiH+fIbLov73q~|I!d28`~jEA0u0|W;gvq>{t(V& zS~4e+tHtL(RT(T6vFFn2sv*+JSd+t29kBTO-!6y#DN~Qzng&&lU$Y zKSdTbHFaOYknI|vkQDU9TwlBMrU5QswaqXmW#=DGMT-q?-6160>R&>bJ?;)_CiVG3 zShZ_EQ(I5ufd{pRh>6Uxznu0mpnEEEzX!jwCoBnGsK(;#{?#$HND=m_pta6wyj#%x z8|*fA)MSC4Gr+FO$8KS9vW8sIrhfUXEO17P_U~U%bv3#pGu6>{Tp}-Qp&;!~lMB20 ziCf$klu9AtR2I6_fyr<06Ch-|x>0Q~m+iqir}%Q~mux7=IWMZrZ%(h>C*G(ZU-JE! zdBJc0+Ifh)uEz|I_pBKoSXxSMqIdx4wX>{l!vm|)97nQ^{#3@$S7Z~uwzh_7(+TZ~ zk5RokiI6QwLw=IYi~r&3t;3>R+wNgN5Ezk=7LX3_VIS7Q8>)~T3P)IV~C(>T*^6nN55 zAYPOLelUE-SX`p-W(4^5xs%M}H94Wv`Qr1i*69J395OO8x}YGDo$-EVyK_Fi9cvVO zG6x3%J{zQ-kcVe!0dEtOk#BDe7Rn)%cF(_nOM}=dlb4@!^HlG^6BWJqrWg{Np~(+IB?|G z>Hf2zW&-soFh1%bf~6gh@Z_enUQf1N-lx3jym+yvTTap9bxgYu-q4oq>g1vQG?<&W zKkJ3G0|Cb_nl0JH=3>0w9Z#b&TASxt?)?!%cLJ~_iU>Iw!EsOI!V*wDA{rHKr{6jO zcO~_3e)8`*l$d~0gNYr2(*sWULcc?v5s^5dHa|Zkw8ZURtL*2=^W5BY3=H={hDHZa z!sVup1#P&h?ea;zwpWl)(y{LqnDas1bhS7?T%zO?;tX{8Ny7T#r7S4FoVGiyOqHrP z(>;|%m96>oO-!g3fd;P1N+t>S&B-Q@^)S<@iniyKR*?=@Jn(z&XXbSkaq_Dy}fd8=YC-R{{4GL86qny`*9mn zjjGncSl~08?IV~s2lt~mt)kYLp6HeR5<{+wmIg2Ky*=EFSU-CGn$&x3VcEW#>YD@Msc`rug7af|V{rM`&;3nqrYx)*N|Byl+YAMCcCHJy-Y%>u76X zV0Y_5P%Y}7QS(FbqS>^SvEQs3geXWu#k8}3uM#=$samAPckHp zu{EiwNC9X57E^3O+4W{eF79J@as8VJLbia-^w&X;H7AH@2Ha(5|DIyVYOQ7jAQ{Vw zIr2NpReo-|x=V}8W_17V_Xo+ibs(aEnJDPswNU2vqLr(hs%osW8qb{h&D9yK&V`Xg z6hX$jZjVd1nFF3w5}RGi9CR-IICo9E;QX$fe4|6Wt z&f+}Db<-T?4}XA(JY>Gg;`Rr{V$k;tli$~_fGJlhL(F>=DE1BNU0)J@&Ynoh^Z;^( z0w(%3j2~t$^zxroC6Y?)y;m`@C_+h>!Q0i8X2#4Bv??`fdgTh)tOE5pp=i5SiGd)Y zwiJ{Pu9~Zop-fopb92?7WLW)HS5qKvmsXAu)!OJB45T~;R-^(k-o$j_aD~!;%8U$@ z57gAPJoe;fBdPv|N!(@fht)OJArlWcGJDA!TEn68Brp`c8ZE&;2j16@VbA;R$I8nS z%iY<2|G8u&!lEc3U(gc5djUOM>Zup?RGl7pQ2N8h)GP93noyXaw;m~>u6E|$@tVw# zvHP$m+NnR>3#ToS-e`2=nw+R?Blw-$u(WL9`3vx;0kB4ok^Ns;<6Y5(NxS#YzmwdICCXkgk z`QV$V7r-P-N&?NAzwi&mV+*lU5d%Tl=Bf@-G=sM{;j9&l_`nz#YP^oy=N653r(1c?X$)wt#q^;md-wXp8&G8fv9 
zJh5qG)0@Qaqy%ll`EOAgCZ2ZVSU{`o+LCah8Y=SLJa=+i2=?rS^X}=Lz2{QWf_B2u zVM2!rc&J$;(qmNX7e*^fU;xeFM2;S8 zt_B&3@ZLx4pQtY4wT;wT6P0a>$M!qtZMVbTpA7LI1HaAc$%wdi2@{~70y{$@I|N77 z2p>N#=+0(I;i7f5#4p1k7f>BK2Gi*vVH5scZO2lNqT*xN{>&#>bm^$5YGK< z$MW|3F$04K;k4TAJ8|!rPF(c^uRl@%MuN%(P(K5+)PETpKnPH8RK1CJ4zF|Hhty0L z52>?1^jey)!yi^kX}4PtXodYztu%Pu|7{phSE-Xg{GdHXiQ0=W*4FPQNI$`OA`(XF zb0KLbi!PA1?BnAD;Ut@b;TX;hVsq#{TSMuOq&H0$fa?z~S|RrrJzmnUOk4+nGOwtW z+E0-0-Onqs_Ze++-+zovF5s8pdrL~pVe|^yBj6k;pGoVnW>x;e053JFY)8yHaB+(nzGJ%09DKV1H;17>{N(!tZ0&d%%!nxjk zmAsd2s`;f|JP6*Obg<8c#(c8Q%4|5t^Q}66rYEY+63i(F*y|q}shG6e|Nn{}+J7%m(8&OaY zedV#rlT}uSanpzAkR#VB(UT2X@Iomolr@3>NfqI4{&(GPNr}FyeOTx`AId)rXYdS`H zYC)mLe9{r^-oxYsZ^r4?-#yopKtfsc(%9eTftFB_YE%a5hnKhY~V|4c{R zDmpf0*MB;dF_f@)_T)qtBFe5yy>FSjzpd|%WOtNlo0y;X&Qf@AAuX{j1hn9!NmiDy zjk>9;2oN}PdMG}0C)O(`=^~2=>@uOYlvSg8fY=<{|7?zK2xoQBY|>!g?rM zKYXq+5ocIGrR8Gvfo5hLwwuiN$$Sl2!8**{ZGsty?t0gI3#`9TOEE`6-GWXkS4DPm zM7*O}<{<$YKQ?xiE$vep6w?S|TC;2STGyQ*vpB|%{!ylqJ0y)GSUEFxrw(k$tx|5i z;;^{`;PYkmJlr|{*8wxYzbEN19bSw71iL$pb7X!9HbY8DNvEUx6t+KJo*k|d2P$I= zw1&&1ltWFAwtBT}8Ro3cj*gDEM7>!_k)~^&I5H(pKJT_!`N^oUQZZ=XxF0O!E8>6- zLK_fAI|0Cl<+*lUiwH()xRPI|8h&X46%q^tEx?+#g~Ww>{Hq!zG$a$sPXwYLYKTeq zg)inU)E0nv0A=~^G_o3}gBqT&sBpDWiaIyK?`0bYhYLJ5byQ60aEv`ZgEp@v z9v0Y8#BwD23f!jXvdt_OJH%+RFF2n2D(c;8Dr5G_ca@BioiB0&M#zhXa*QHrbENRs za-kv|RHH+NAfj%4i^D)*(`f`;y%Y}~y2~n^bbJ6#4a)qt)dDvIFTMC)^P16QQO=Lg zP2V4Cs8&5@kQTv;;TRK|HmMmIPJ_VymhKZQRAuw0$?`GahpKWCgaJA$ILqq`dvLKo zrJRTM!h!g0>#2&kG&MT<=i^^e>*_hGg;Ho|ReG}(j%AfRbtWsqP?}#`Y8QL6=D(VA zItB^D!opN3t+r#$I{QK2hRONEKNOcmw#340Wv!@H4L-6Q(QCmdfroKRi2vkc&_P=!hn zw4D-#{C(#v<2jnojSr$oha2xjLer0x*@#N5-eR-}2@2+?+M{eXMl(_xlT9q$ieq@4PiDgP~`~ zOCRtVCy&#fy?z8*E;wWaqRd+XNFP{NH2RSMtmSr%uZEM<|2>M;J zc*hqSBB#l?ORvjlnp)bKOiy>Dqq&oL*M7UFMx~F&9RS~CH|m*bz@|PuXnK3j?|zS{ zuK4bi^y$toujCRD5s{T`-38czrSIQE0ZWUD(0wGWFx-%I{KCq&XQd;t(U~4B3Vuj! 
zL;dQkklSefI~Ng+LZUq-)aG{&pskbnOrC#gsVBa8W4)ph6MGUt#HNm&0ci6=n&d~q zW|@h9p{ocNq)f-M_uEUyJZ+pOE}QLhVREsfc(k#O*L;oLk>vt6k5bd{MjL8c?IVPp zV;E9N_pHiaH#26s2o?SY{`{xfyyk<7aF4MKQ1JGE))!%zo12>^`w2pAW(JF#GI81? zWYJ>M88|QVy}PM-uTlwupRf!EB0rdq4FS0G)MTXlCYoF@;CQWP4RFJ+zqIY^$_B@# zs!4b)bsh`cr&tfV(G#DG4%%fEIoRIL&w@Y0$Vd|#?6qS145A;iU(`paxtci4R!DdC za|XPq;WGAIX%Fs?8oFdNZZ?Rz157oO_HU3m@;M@9q`d6n?)F^#>;l1xgnxJOl{zHl zv04U9spd5z4h4)kL}$$%u`O#DjeL2*^-EP5>ihOup@+k4zm||`&j__?^E4D1ul^=Q zpZK2R#r%?^*P&XcbWheqUWv_Ov1PL2VJdcsrF%rBaBhF!?<~Jthy`lHh;jXgL*I*g z(2nujm6;apo0^}5!}YxV*`7Vt6MJ-J`o0e>6%gez1)7R0toJ`!fJZ3zruI9#>pL!1 zR^O=NvM)B~SBPKooQQIu-CzB4wks#6`04@G>64|(TxajyYZlK5bo;AQy)~8 zG^$nD)%KXyt3_hw6cO*Q%0Rb2U7RxJ_zKFqcMjspyFoI`I1*IKEn0y=qE|VAZ>izp z;$j)m06^o_#R-l&u`1y|`F=^l(<|t}2g1|V!`xk6Q6mhMQw7wd>$yq-p>@th=8Jff)El*ge2v@!HybuumwWJ~JztOkPXZzjnT>{%xx1e^x#w!Au&BOfhEX`) z-q(uJ!_K8qCb?MAk6CwKYJU6;qO*uo7kV2JlIeO@ai3DfAAVnfQ8*msJqr~V!+QmK zGKB0Yi6!=Q8M!@y?OvsLgizmp{D;g-PY~( z9B&c4>cKn?h58snM99PEO87vv2wn8QUZ_()@7$`SvZgq4x2R9uvRV!0@L~GQfT;Cb z<4PnzHfF8p8vL$qs^X3kIs*Q!1r=JL4#UEuw4bziXe6Bso(GgKG70gqg48BdfNe3lClBIplk2P#T=;PqKXcgcu@vDG&&=f}av)hR=otg;!)g1Dcc+`9&({nF>aKLEP)n<6Benvrq1sW)T!AW~GN zkS5K5f941{dRKr;vvd)Rr$=#)M+21QC)tO`zqU#| zzI(E0c8 zX}~FpGt&L6-97{Dfr+gIb<*jY5d@=k8SQ~;p_%j`sQ@m!P_VBKlI_&*{fn&>ha<)P zIUP<{CK4wz#RrLE6h^i)@^F0f7tc(LeRSqpe8sn3AVBj}DEB{2K#^?-T~9ZS*QN8q z-Rc9K+1dQwPv7nq5}bVRiVYS^Zm!zgMyYg2F@t)=adDCOjg2GXL{?k2LVG82SE{U{ zx`qrHQnUQNmZe-jYJWt)b3ahEMlp$-;bVMj0#f4aq^l))KLN+!()^O^R zrc{Lst#o-xnF>{rYJOTw8$1^oT({aoZ#xFI?A%nqOw#UIiqFBr2Rp6EA%`BeqnZBU zTw~o-tc8lx8&qz1IfKxh+D1Cilmo0bCd%;~)n|8XiA1VF* zEMQmPp{>8yj57#sRB6>VHZWk5w6*-C!jZb*%hT+I`Cv*bvNse>p%0OEVK%0i2? 
z#9xIeR=gB#fjkolHvj+-{ko-XRO`<`wIfW3YW)RjxVVyKjc*MV`g+lmPO4MQvO#8$ z`D=CT1jc>-fR3InYfxj|aaWz{;C|5Z&}E?7T9V$IPS}=?f#^F@aqnszYhU50WVSae zQL{GyojYoO0(*4i{j4GZr<8xB$$4Oy4Jv9d4ZGN4So+y~y!Lx|xIp#~sC>wmX}U^% z@f!;k@M4fwOAVj*etwFz@_YqQO9%G3)plyGKjqe+@O;Ocjz;*x)gL zX8mv%MgDPnOYo|9utFjdnCt4kOmGc@s1MPygvBC}>h$ zMDZ_QjVm}7sdqVlSzgYBC{$wZRi}1*@%f(AW-pU6rmN)FPl6Yq0`K13zO{7c&B(=1 z;hZLj^5z*x_DeX5r%H|*wie1J<&c89%#;{v5@4pyQ2PGil9G~o)rqmDyj9OIsJv8S zP?u%Gd1ytexKY1FZv6t7ozoU|r@oq=lJ(TOZhzVdon1=yOg%A=7Jx6mMX+AjuJuj4 zu%RB(^L8dYs{s7u$|hi}f-p=}Dd!6E)Xfs})L~W#a@4oPP>PpS^kBkCm85dd$s8I# z?i6g2W87i(C2@(lY$)d7S2kv6XU7|XTD(cG@XPhkai7kXhxKO&W$r*qPuz7Gj#XNv zDxo9Rt}Fvo1~c7`&fVqQD6W}jvc zK5lYXtoi7zN>Xr^$jHSI62^EPhVq^{7H>}sdMwnu=!QWJJ=~s+SV@_M>10b5%WqB5 z??>de+IR5Ai%R=5zC_+|UGl#3Mo>@-)`Nag3B(7H6c2K>id9|rbg~^n8TQ=0Z`^8M z$$K)UP(QwNxe@w9#40iSh`!9V6N6ytgA8UJ6zA^x%GT@MGEzFT@sk0Wn2?bUDWQ-^ z`m%Z4DGSPfUbKHerbK7R!?2qZm4W{K60L_bdR6LU<^T(yvkqdo$K~UNJ*uyuqA}&G zEF{%#mqF*1JO&W~ukfUWLiK;xe1u3Q-3{L0PzVM6-k&ebi%BIAp7dk$w+9`J&jU%0 zNq%3(@1M`@1@i`db|4L%kk)g0Pj03sw7T5+d*zd*Ia#lP#k9A$GRA!SOy#cK=w32 zz@1w|FQ8^89zynDC8T*C*Tf$xp zcl0I9o)0bwc=?-XC2!ID%0%813RHH( z`pm5`JXN$ZnHZU`Rzs?9+fj7{1xV=co7T7td~vj$9nbTrJ?To}=d!u=a+?`cP)Y_} zu(zo2mOp320SAlHq#PWqUb}41N{kF0a%E-kCrqHaX*ifN91=nZf#~1&&nn>`RH%(N zn)3jG$FQK#syAT*9=PSVw_&jI=Yu{(j?tEUU4yA`);DsQ#g+=zKg1tA9gkA1DZQ_R z*?+z_n@v81+JQPAiQf0}51OeUd~hKKSF47rBCzbs%||eHH+lFi(fgm;GQpeyflidJ zw)j+)I8t^pUXJNOvx9BZ;UbS1@W0^iRXVj;4JGGi#klQEsvWo+UKA51TKGV(x-YrJ zQ(UYKF#UP2sW~?mFe2*se0_bYVO-I`P{>-1neN^0Yn}D7HGB!)WZGQju>W!=WToN28Y=;hFdxIdED4%8xk#GuFUlMpo5rj2( z-~rA)v?}nP2ob9v=#;6(M3G~H;#&3XIGxV=vXX^NMbabpN{O7M_F%|+>q(tNXT1!u zJfrHOFYj`RJ9cDiPws;0~u8^Wr`k(2|V!Dpvs`C1$IUy z{)(V$F8d&5%K(#Zs7(|X8SA)Q?a*c|9K=JHn|9nMWXxJOKXF1i2EE#XqBn-2&o&ih z`BsjADW4~m;j7@vkNY_z8RDe|6Tlr+IG2v?jIuIgq?R-7#o!7R%9N>4saU^zFQ61E zye8*CEmd>Gc)gZNGL%(_9K*IL{O*^<`s&C_d{~+?8(M=ldxvqvQtyXot4h25FZe$L zlV`~vCdo;@k}!cviWF2ZsAVp*#FA4)HAE0mJNIA{P>4PbZh&fcJzcxIA+C{)4tn17 
z-Z6u_Ys*Z#E%1KZS9!uXfre5vG+IYd1F-nCn*h7yzU1w~cy26VI^4RN$AH!S&dthY*>dL2tE zZq1&QqkIue@@qXv&7(&&#}NuPf){8QjoXRUXGeZ6aSVRa z!Ho*FQ+miPiS9qm#M$G^TSKY9N9N3y#Hc3i%W`m%(O}OwN_n>i+Cg8#3Hpl0f|aZV z*JKO~4V47Z0If16PtNi7kBbOAy2?LN1);4gsK;YW&{M0Xt%fn zDAN8Y`ABK-b6vlkK~mTk_B6dP?x(C+AJMT}`=^PbACk4(bz;Cs@g8KE)HF4{3^xeA z(N=Wn9M2Jeo-vC(=7kKUzh@pQ$BYSUZHyu-UmK-vS=7PP$l`Rm%+R6>w` z&6Atd*w~dHB-vRv3Sm)b%v!JJo$L{E*`)_t)8}Ey=kDN4y1v*NW)~452a#)C)pqlr zPg1+sl7y8L&Oy(zG|OyjZ#pWzZ)6H{PdT^QJ?9AMf-c_~U_EXCeITa#8DmXaD@`*5 zRx=Sodvqrnb8*N=yKN+pm7T3PQ~u%oYO$z0yR^x*D-VqCHF9CQwBlr?D`<0~py?vB zb$2%F@z#!{KLj0|>@Q>@j#T80O-ifxBf_G)zs0j?a`kt0gk~#c%ft-rA0_dSfapRG z&?Xz)S)x>qj=n!~Hy2C&>q)=KnZ>^~r++{eQc_qrWVW2v^5x5yau^Wq*^~!`ZSFl7NNAp!r?qfy!Vuf%N#vGx2S9Gu~t{pvVg_2 zw{NC+BA)iqApmnD;iyTCVUGM!qA>9^JG+b$>~ongpO}$gP0K-4cO{P;=?7>H4=m%JCr+&b@w?lWrS-pc7D@>cb#1`7zkfcq-98sQXE(qG z?Z@^b>)TS2UyuvZPbce`d&6~B6`?*ruQDu=t8i~o!X`}}<6uTloN&m5MJCas^bRjL zFnbxaQX69v+qgbHC16FPxpsU0>&A1Vg^7hlAUf^-{P&UMI|IuQ@J?e7A)SVeVe^sf zPcgX2G>XF-H_k7ZX z+NQhj2Q7Hr7E=0lZe;ib#N;x5i6-%%!*OuovoS$J2B6Y=nw4;qMhVeA{im7$A5>66 zo0ckK6xx9D>#3+C_v3ze+qZK@d9dq9$qzKNv9m_YW-qKyEFuI*w>G{P3j55m!XBnj zzl$SsnzZ=z1eCqKWTb!Wq8W zaCKgO_J^mLw9egkqHjXhZ|ETgs;@aOTg*E zU=WI}shc_ezqf(Fa?aZR0Z0uF3$^}0Ae|=_70vKoCApzMoV4k|-te-3EWPwvje;=Y z3#$jVGozVu_|OyrxdRJBrjE)FYh}`mxn5E7;bYfDv_IMFdyJ7U$>JKal`9YANVt)Q z(}em$pEYTIekhH6L)eS$&IrtfixK;Hy1AzET!gpg3d$RI`FC{(^ZuS3O;{Tyz?cbMu8-9ZJW@Kiqg- zjOHxs5BmMoZm*hT!V1*L`hrtqmk&PBp`*jl(b}1XAk*dLj-q7`YS6J{E7C+EX@dS{ z^EEWzy`+P#-JfSEm=cINtJXD}Gry9o$fjLY@L;*HBYuxQeSXj;B!(^Bat-0bJ4bWh z95(%81e%nsa%d=XRe>|TAIsdJ{1v&4OQILqG0O{04Xp=_2}UwN$ocQC^k-D{4?tsn757#A!pd12><54_cN2MR!jb#D7qldpGILRY z76oo^Ht_j*q60tS>(x3ZUUK$74_Ov{94v#ci}lug3aiVaJ^b23eIyNDQofk7!{`|x zUa9vb-g7Y0Fr+7PdMPChg0hP@8@sxE6Sq7r7F_ciq_23ekR;F9*cj1{6KRjuGNquqJxOw zL2%tYl~OT!eyy|FH_e1#`H$2rwO1PmMx-<->iqBfzCc#cXdO3&dU=Tuu^WWB?M?$f zUuB$%Nlg+3Zc;R;o)U9Wz6Zejrl-U54Jf{ro@qKUxY`P*2_n}t&6~Q2i?|(wpLik? 
zs|#uli492S#WgMPzt#TU!>swj#wK3?lux)m!8>V zyG)p!9UGy~2LVZ8vz69sAQgXTdUnVgw9!z|(b1p1nZ)4ZRM+q--7@#jyh!XQ+EGyzBrrF%i6eKGhgq1o&%2+4ga!XhDC-++!Y z+*fkh%w`JR3w{79bhM-6zVpjo)c-78#QJ60LMUsaKYVCgsBO8xFjSJy-@FYgnsJzzY7xe2Ml>jwT@jI7@Q3kI~<%Rh=o}4lUIPYp%dDt35 z-2e7H@_-g%IZgWV)>;iF`hzfev%XLC9=Dg9PGH#CnNFSa`sdw-SNZC%vB!bff^e_8 zqC$Q6;QqtMI=>~KP0T*nv1ZBd_9+*L=kMktlzqY7!*qxt;v1wqR z&oxET$;#G}i-ooO{`pS61lGjv#lTy{I}}IL$rEm!(I%Y!og_>g7B%J2<25IL4-Q15 zhyoAvYngJ4H_h6CT%Em^z>h$hl`1v zrYHsm2I@GhC-q>=T6AnI?yEdit+J**kUBH6EvvX|=B`os`MTtC!fzB*KD1+irHF9BbcWeh7|O zmW&`i%IxD|a$TXUJy^nQSk{+weHsue7KpcL2$8OF@Q=8oEJ(}|8*Wd|Ia@2i?VwRr%~)>0xe93Iqm*))cTU-Tk(Fnl|GNYK zZbURqWQ9;T9v)tzkQ?*u#Tt&+pW~i_<)1+Lc?`Nu2t8sCNXOu^>?c60I9#X;1SX(> zB>3Rfa30}P^%uy$KyWe3Ln@BE9&!E|!1iIdZ5#FH<@lx^d@o9_7Uf}|{gL5zUdg7* zUG+fhiOXK%9Q*MKIr(G0AcdZ2nIai?LWBo^pckM9=AS`ZZ+JicG$2vVjqcbGD;;Mj z?ERE?6-Bt`F7}(T=n*~=3rlA_Yw%V9uL+b9Bl^xr6_2rxuR5$m7ag-J{Ojm8epv$< z3LWFFTP&*kE`8ivI>Z5#B4xMOOD*uJ?LrQYS(AhrY?I{#4B;Gj{isuu5x{5JsdL2Ur<(fp%&1o2i0)U*H08u1J&sMd15zxw1hlc^`Tk zD+T6lWR_d#ZghH+$eiN6*(BtXQwrzn&YyI2v|Y2W?~%bogBIti0Tt6aHG%^m;)aJ< zMId-*7R3_ECj6fA)itse@x0&~pp<;W!h#z_%Y`>x9#gWxFb|`eT}0NxJT425QAv{n z*Z)Q8oR-SyP#PN0NwR`RvXSZc{_wQrj(Rnea$K>e+QaFLpKV2X`Y_{}MDc#ngVHHa z>a=-b-qB*aYZ8H)YAs^oS@gSEk9Tdw)+zvS9v#VDGXMW-@ezVDR~W^UC($2iWmiE4 zwocU>!UB0(x!dpb$q}-NuGy2x|D% z)ZK0L{b5!1&#SqneR>o!J2KzjM~Y3$&|a;kQJGPws0yxvbS$*V>p0sP6aDI<{$D;t z@Rp2Uqk-s8?_h~%c5fH?sog))Jmu6+{`V&Q_e1w-CYUV3!S+HWD*W-+hI+5dUmp8aGoxT~j3Rt`5N>t* z8<@Syf$K58Ev=br<{G3G6sYE;q>v%nSBiZ@Dg2tJFM$Jz+kk)b+3Rt-J;1<9r*N|8 zsD72opHCNtDkIyM4=`1`}JSTZFKYioZT=@SRpmb~>@FV6VGtZa9~`-ScQ*>&h>6@hS|&TR!{WoVi( zboJNz0LH_I!BJ5U8=IQce%Po183q$w>FzF|`2_f{j!elw?!i%p+9x2C8fDMbB!NB6 z%wE)GBQjtUF(D&4#opYVfAbFz37P+U!SR5=>!mq&)Y}C`hmKy1O~g13?t;n?YK1 zSyfdR7}daO3dnz5-Ib>E#pgiLJ$YWg4`{&xN>GpST7jS3RtF9@$stQHGja=@fR8bh zOjN?WfiHRL5Dbg>qtimZ2GmnWCnwKcT~E#5EwefbLJtghl|gRiB+Rn^Ggr^^R0kJ8 zpnNz*X4OJn{?i#vD)Z&%Jl=qriA>6*2w8+<%L?eT$Y&`I9#i$DvKk>bdl=Rp#mp9W 
zz~T40B0NEujD^(y#bN(_99a_Y5qh-2g4@*yj?eSSnF&KvQ-okaXpYfEl6;N0Y7c$@ zk;j*tm{P251c9YLY-g|$F$(w^?P4H~i-XYR5g}ntMFlBkh4Z>R+9W`+MF!KXxP9G% zjwvx>S8{^6n5}gbC)I9b@D0@Qu5dz)k95G+_vRX?r_HWTx3`vBT2QtIcc+s#U{stgb;TWp2Fx-A?}x zKf;4}82t7Ns=(Ei5c{q(46naQ@%$O}DX4D<}V zJ+SarOeV|-IC{|Pl4g~(1^mxIODqQT+;XL?H{4wofc#DStgX3L+z=KtqPKQ}=#%O$ zq@t(6BT2>#RtE1|V1&WTp`_6Dn@GXF)7`?_Gu6|+9&$&>653Jx2J2W7A^|_4E+9*-@q}_E8%A1BKqoUeIqM=R4KdIfx>_F(9d~Zq}@lP@^s=+`u$i?Aax-#K?gEe;>ww?{8qJQlp}U z6-x)=UX52R)HoQUd0_^U&bNu!x7+ery`GpYuFJS2H=K(+8x5EKHM|FORzS2V6UKQE zsQlWegz;6bMyj}rIA655)9MRyF)tm#{@3p2?j9!_LsKv#qnLy5K+QUhE-5q*>{SI> znzd|(TlF8AEimI4Rk{!04O0u~*yMTBxr8M@$ptAH85s#bN?nXd#e)s`iWB5nMin42 z&U9fAZ3OJ1cnBg$3B(y9VSRpcOW{=I@E}51yK#D77E@p4^E=zrE&!|$cly8kK_wdH z8HvTr+oq<<`4{QL|Hb8q0;Y`f@8*~Wu4q<-KLI3en78^N*ZSTyGpqMC+7*|_{sf~B zBM>q17>cbTZzh@0eq55QJ>SZ+l9X>Ze;7NFqeSe1Ap70!&k+gScJw$`;J3-KoOd?>#LsR~f1{Q2EGtwmiGvYrFBTJ%9m8yG*lB*}0dU}Zx zb(&9TG&$zDF17k}dA{1><&^YQk6X{G3#33+8u|RNqufkd;8&pJM;eKhcE1=ES3!c`?sWjH!1o86 ztx{#d6*D>E8S<=BSW1Zc6LPmtc;&rfVOd(g{Yt+Zxp1QQ1nQ#!v!{ zFJa2J>tu$EeJden-OLPQYt_$q&NBb++9gm$+cKg*%`W^>#>UU%NBUG!q89@qWV@{w|Jwqg zX88UnXhhh3uN7h3KC-04T;ZmGxmukxuB=58zB}$B*63REiM%xN;iFg-sX=6|EzGdm z_l14Zt+dL=z^(tjjh%avf|PpHXx0@DR8t?R#e@zYl3M-(n%Vrw{pDp7Ua>V<654v} zktgfDKI`$?1gMNUGu-W@lI>36#2o1xNccuPH7TkFwg$u;NO=9iLPFoghG{;du7Q!K z+0LH$wJu`haqs1VUUs(#1D^LXMGEF!rKA}r-0<9fq5HPV)9LMbM%af)|M)6GqrDky zijXE0eb0$MA2~!F9>~wf6TogPnvHr3{!{n(dujOD+XvVWz20A_pkLUkeYu&~SA+S< zTO4%}fE3`z*QmX`LUG$c!p6Q7WFyu8553dNk7f@vwr0c^@pzCCkrTy|!QgUiAXnDp zA{;bhIQ>%pypEGYQ#1eNu=gk^R*98eqbgwH8l+#93TtT zk`74esGviEn(C+>m@)yR(QUVFciMWnRfqzbvoOa1^cCtgQw=tpN~Lxd7? 
z?Co0^L{Gl4#E){9n(}9}k_1XZ4{b}EzC#G)2Po07{NbowjZ`N9=I6WEfE|D)KR+xq;hQoG!=KNsU6hUB~?#c07y zM6LT3gm35%mZ>E4csw@RhNn!l$j;4vVrVRq($1i%=bC9x;|b_5lEDJ^{>oJm4N0)l zC58zy3$GXZUEKq+!9nlZ>m*G(6J+2hlb`fJGkmojwDlR$V{<+dwLkl#KayYpcP?I8Ohc+#+au%TV5S-8wJ8+w16 zNmw)qa&#^VYy}^2as6N#xv{vXOa&j+WC}zF)ilLzgRP2-U04(ES5{F>jewDh#!`-s zZ+wf0mq9Gr!k<6RA|ZQJ5RrTcCsDvh+{h3UR+N@%n=u;gpHzERToOe4o||dVqXm}{ zQxNoi0miM30=X~RoXh-cC76=zJ1`AAAOxtuhvTc{b7xshnD(qjSY} z;Vn}BQQyWY4oI(0#uYg7X;PGxpBkv?bt4|r&q1oa=E)jTnq94DI}0BoK6Ff74o8 z+7;{6^n`|;)$R)NnYa3*w+nFp&KI!yhC<(+%TQxd+0J8+n)@k$R#I<~qx@<~riO&H zhDU-Xnc$MIp5si2gzxTqwVeyeLrnGcZvaq5Jxawx5Y%v3B%Bh2`vXxjHXc`}uhC=> zhSl|*>Z&TuE-cKOOP)4g5aaJ`?y(KxIY77q5Ur;O%Z8a)$G!l)@S{gnntA8RVZMB^7@@$V^pfnEcjP_a#>L5!Y zyeh_)$TOwJlmO_-0(9nL*^Sx{0(d$=iT~>M2)LavHav{$T`|F)*WhBP0wyle3!h!n>c)^`}LQ+TTvXCOkZo#xfQ1?rN@=6s94e#luZnW%d2f-~X2L~lo zI{?M2N=01uwGCo?1Glqzs>nNa|7}Dj7Qn7uzPPG9kZ%Tc&hO9Upgh?kfD}>tu1HE4 zDp9kwnc-U!E(5$*T3wR3xW>TuC0a_x)?Ty=Y!l29dz+i+!om%cP8@%cQX7k0@%<>j?;ufTBU*^ z`IlTQBH3%XXOw_61F`5&B}Oi5-p)P(Xi5fPWoK@*S-yLS$LylG7+m zO}wO}WcKhP$bq^{I1az{6|l+ElQH;)?^>6xMnSJv^1oEMh(0qmImf@%XB+r00^ru5 z5+qdvMukz`%2)!tk1Ja4mILiBZ78tyOcQ)lsAG%I(YtfF!PKp?hk!k>f$I@6w@wGu z2%tBfU@vSA#}6JDh`t}Mqi2zjg$@GB?r@$slC3=GJG_ee-JdUxKL&lHBkRCmp!mOMnC*sMeCdbftHV%(1{cH{pKWP9S7S${F$vq{}#d{rls; z)#Do0L@nO6-OukfutMS zBcibbztgIKZ<}8B(1nFh^vcn?nPXl7CL?X9qKk0@?YB@U(6cRL~E9Lr}oLZ}Fo*qn zkHmd$^mRf5oH(jULVt2p-Fg_yC^3a{U<(Z2&Hy8!uReRlL^k&veCzhUhi@&Ks?iT%PUn={zpG8YKk^!HZcem2z9N7`{dc4lJ zl3Cobdgn zv0L(6w36oNnSGo7^Mb`f}Uk5ikP<7Y;d{>y);zBU-X2{U%*`4k_!6X zU?nQTp~;_~6TSg_rXP@-G&zfuB+-XP#RIVzd{!(yzSI6plbGB~&ZMY#&z_MuElZya zrqlsay}1$>eG$NC_jmieQzF(yEt@WTXeY=xPAF_5%!Qiyp7?@1^=7>IP0>_b;JRyI z!Ox54a_m3*f)}}W@a4E>!1{bEoy?gGY=J(t?E_o1FRw)AlQ`tOOCL&SaAX_<*pYG0 zMxHI}&fBXrBE7OeWa;*SDT~cHGec-;RuqWkk{6Oi`R7@iEnKNH5Ca<9GFeaN9@0@$ z-#k7#d>fkT^47W(Fm5(`Bk(dr$0tLDU&><@A8!4X0P^^nVGH}Zp4gQ9^6|Elq4nM6 z+49b(#OR2gkJz*8!vFNNLk&s}+4#-m4`x1$^t^2K`*Px_>X*yX|AGWg-1VQaqBkBr 
zoS-Z{!}9)xMD};h{JXk!goIMW0dX-G9bv7VEL#`_~u)@_H?rIC;4Q%%g!*vCAf$o96ZPsF9=hIn# z_;VN50PCiYA~}ItGXOHE>+RKb5S;sLnrC+N*_LqXz*4MdU)a1^leiH$uO75ha|vZB z8WhGvrLGFv0bKbR60+K!Gb;nlr_T6tv%tFJ&8df>GF%7gvR=g$Z4ewdtnM*ULkZB{ z#6_QNf=NH0znn`&B1|ix8mQacCSQnNPO()LzE)=$Uq7?ePTl&gokv z((I!nam_E8Smy5H%Q(wgpQrmKA}{9+dz|?Y97ft*t%)~50Va{IVt@_el}5$H#=tyQ zc}dK8WJ+ZaD?W#7=NUa)3Y$)@CS@V}(y1=7E5eRVg@8##YTuZZh;sj^`h#aE+`P|u zg_^GF@SM2-@|qc$oqC+|MR|0()CavbX1bEedh_`DI_;78yy3GauFgZG9haD!d=EtL zFbYS!a;SXEVz{8%C0>}WS= zn`;SfBt+A=(iV7JN^K0R4V-{{3nOxRf8xgUT0%;e4McwDiX9c}dbgtN`ygpm|nvws$9-?*~MAiNS0GfO(211GQAK6;+5{f=K z$ppJPCqauq7;lPvGgpoEnB)KobbR&6IG~w?|xi9>znt5w{9!Pd}lDc`jpYGrq8JbcW z>6N@FVL#OO*gN)dI6}j{WI)`w*UmU(rGt{pLUp1Tup3Z5Yc)=FJhRg@prMn|HpM&k zO2(PlS=M(=S)oUu;7T@;1>Zf(44#O2Uspkhq0mGgxpkcm5cLC`R{@P~-#=l2w(mrx`^9el{ z=P_)5E>?ZJH(vh0%%>^5VCcrPK_!WCyASGbZrM_Jeo7ZIvQ8H;d1msk(6O7hOfap$ z$O47Dy>M`FrzT`{zyz;{gyaW4qkJfdrIdWuZOCLH!96!o`_d%v#O^QCizfqwF>L4_ zQ{Ec^E5NHPwe9>Q7W9sgO1B#9rbHY?cy?kc+s0w{D+Atv99Mr^%8fGt=z%voFGVni z&9(E!mxdR>v;0NEqBO_EJFRo0#x?Ilqc4J|6uIQ4JJwNaK~+Di20q#%E(mwt%vP;X zo%tw>-b7@5(oCK;9t0^q#s?5)gw`lwDLZGmwOblhHF(F9r$1_?E-V21o6t5fd=U`# zezK1ObecQ(H#P;@G`LY2g?AHcL4w_U8{Ys~#aHyb90$E4?Lg*u-X-3kIcCUqtd>-3 z(TPve&GfASExsK$21=>u8L!^Y1Nyjb+3Pi)W5Ajz-bsi{#%BBs1JhWi>)>NXuDRJp(1p_f4Bfh51Xl@B=L^ts5vuPic;#w z*5NUOsne4^Av%hjk0%v}N3bO)#0sSQSbyq0-+U1I<>8Gxsv4uGPm;5>*{O!iy;fLa z40H^OP|G-d z7%m$|e!ls7fEbJw$7p~vfUTZu7Ec^Oz^NopbYIv;94frvjd%%cndU3euwkZXc1_Bw+}UyOZAo4oN# zz=6Ycvi>+|T#RtsO0m|vAW z$Mwr*7O2Y-??yNMl4mbzCtJTYeco!}yvixGrluncz2X1n&DoN9o;opt1H^2t1eV`g z-+B3^RG)qO@+R=gnSr2`?NUb3qv&~PH*Wbtg~u%g$ZLF3m;HW(yWG@`RS`4}Jz^D7 z@YH6{4rx4r7z=rNv*J@V8wjIb@m^`8RJ65Ax)*;5Vx$?=1jU8~Fw(J3Lt;)9-;XU5 zzq5~A5mW8?^w!F~nYSo^fxTSCc(~sTMo<3N!@q*daqc*dpD&W_sh9lm$d53V$Bir! 
zoq9?XlS}UEjZ8wWiKCheeHfiltdX~PL4860oF5(Utb!=7B-baGy6~E#{%`h+?{2zJ zY>!{FdssWwC}`dQ<#+E^@s)3+R#Rg4cbYqrcDQI*g6nDSB$@w7F=r zSg}FnFj(HdcS!Ur1|Y@v?GNz}bJc~52Dw$7AK2*e9M`;!47e(w<{>sK`F-D4COjNV$3ZMH!yuW#rm*1A1!Mva3S{nsu7L3**| z8%w-4bcH^YnITgf?eG-H+#3kqVqk`LPy@jX9x=9e39P~`yM}zZE3Llg2kPAiayUeZ zQ9VQfG4am__P?OE_bW{r3XRS%{(Y0kVNf}8@ed=Sp83lj3s?BNFu?KFR5pT`-_J4C2Dse_{xuNUOz%l5lw`?dvYBJpdJAL(8yE{*-lM=Gl% z_LW-{GU5t5rm7Mhs9zo7$P`un#6Sgz48^|RPHpjN4!bXlzYI0rUezmo_F%ZgDujJ+ z-(j1WPbc?8MN8_+a1F&3cNeEXLWb_gK?(W8H|9oHFJ2rBJUUEeP8;UNNS8p|d`syE zkEbKIY^r_rHrlTVn(sJni}05`9%vIbTBD?!8N#J9vHtuRGCK55ti*vbRhUBU(h9(F z4M70P23#IwmX>(UeVBhX>D|AMI!e=}V8BKyB70edaXGhjQ?XutyDLl)tqT!G`Ax4~ zi585=D#1@g>QufzYci^fnr);4=iZt07(4Daes3TbRT=(Wzv~XTTPn}`*QCd4m~`JY zA{WNWY$AP%y~_l_Y#^f}aH$V-tA_wl^t;~MtG_DkjcidzyPcD%!vTXPg*Pcft2cj* z=zUmM;nb#HnRwRr?xtR$7JI1e_!#g3vb{3yZ5iyol*j!k#bQ;~>*10l-fHn$m2D@R zI)~iI)o_-_qSVxqgh!@!3X375A+dZ$cBAEzNUYXbau`C2T--ratPkAMEg71Z2v{TeU7%MTy&j!17 z`!)vHlHsxq3yhJd<_Pw;!>K$m-EJ{_?%UZu-#2Bxsk^$FR-U;E-nrq#W6T;v z)KEHm4GSx4t1M`U@Uc!?RMBXBjYPSXoeS7Y)cO5SMB$`3Prp;Cm1-<4zfP0gK&ar( zT|Ajo<1c@dC+M?3d_-J`(z-1xWn#``&T6=~kvqs7Oc(z7>e{Dlg}Y0hmduK6=y z%M3nCyn8#iJV3w534=oY!Mut=03tupH*tHL>N%T8{zdogj00F%>)SiCkBJxE_itXP z^=e}k9zMF#g{xhoF5b^(DbCtj6&jjR*tLxOoU!t;y5}gIS@TN%YI-_h+wal3tWXxO z$EI`mA>&cV-PO$dJ4SD9gByWXI8TAb>q-7w78I(F^$gb2-0svV;{mz{p)#RJL z>bI6?#U?K2m|CF3Ps%^;<{p-{iIBD&4|0nuH_u|vf9XEA!X<5WEC81|AZq769M2|h z)OlD-PNbEF-y(!e_vzcMYw=s1_`dSdUax{6srT7Jg~^DU5BxNp;D7Sq7f?vbixLK* zdyRxm)VO^}m(28O$W`trVQMxxJM=HVQNQX&6X)9z4FqKl=m|3=gPkCdzJ`qMu8p$A zrE)h-gMzY{;nE!g0O1SFj#Fvc{>sJ-Be5Nwl6W@kbcA#^I-M@DE81|lsCUsIXIj(bN4>W{__rt*7B?Oujlbf?-`j`m);z=U#Mke4q3AbVx-cDWx> z^3Bu$Bb?@KvBil10UoWAjH+-s|Fy82MOFTQ_M znbQ?Q%bw2;UUrKbp4}HNQGQHw_Ug>Qrs#oP)jaR1_k$g;l#0j&(}{u6Ja1cx15Xy; zrOT{KS8Se44}CA03|k@@98mjR@GNxO5TO} z-{Nz0T+|zuvP<~jz59VNEV3liCdHh}cefE7TO|RFp~D!kpBf7$7lMN0TOU2`yBvD^ z78;?PobKIoXOr;VCxRO}!<|*D{I)$^e*7dY+Y`XxF96$rxE@>tz}HcYkl5`~j}vgB zC4y}v)T;%|_gbvG((D2Mv?NrjY%vF$4LVYjg+mbcg 
zg8(&9M6C>}dUY^hvkuwc9fkhM*wG3?)SN-l^R{2U#5o>rp7W}sF3T3Z{>cl*3l2DS z=&3W#CYK-(v=~knm9zYO`iyj*^9Ob<&7wNMy*TKQ7#0m@-B0abzVU$dXOP)X*AVoj zJm4LO1vG$EujC$3&ap~5zXy7d_QnJmU=CIMtp6%2;Ij@_tUdV+?4OGO@;VHtkjtRy zR|FgZAA;@09jE#J#sRP;E98)UJ_t%fb3ne%5(ne!2BMQ*rRJB2y=(BkDC*`KdlGEVKQ zUXbzlX$nN!fN8cr67;6Q%ynt7JBXG&X&M>>c)dg5`q}{avgBGPqMn9HXbG{h2X?mE zsdEB(0K6Om71Q!ethT!jD_^SXy}^RlfhVVGi2dKVS!f6$y0V5ZiL92CE^ z6fp>1F$nDceXe_yDt{NQq@!L8Ize`+TH0;-Vd!_m-p3y6*Vs-Ojv}y^0$xqeeb$^* zHXBHhB)Fs^oZmO{@&@Vl=G%pqJm)c=y9@GEuC{5n7f`EUl8x3f0$Esm$2F?c%w{C+c1MC}J(>ts z-glR4@mC-GK;Pd5|8Lpvct70-4C2uHrYFcZ1^*UcDJ$`+9Jc<*WAem4Ahw2SaskIN zSEge{)(F`x1AW;kIUF7^=A50l5u-k*=ve5FpYkmW0rvU+H7zd=g(#oSrVLUGyZfRs zj}Mn_YD-K)0yD6YObSh&*PPL@%VurU)snm5W>)ZttSv@>Cp`mG1XY3XveGfdo8I|O~mx&eYTH8a6`o@z( zO2b{!L>bm=)*4AsT3+~$WV|@t2aY?hXc&$CLb7f$t344mDR(X2S(sRYM-8U>SWDH* zgBx);{XXm8nW_TBS%K5-_Z@YUf~I@Ndu?0a4a~QQF=h~_@tAU_X}@RMeqq(|kzWkD z*XNc%yAYgKrd3q{9@|FA4 zSQSuCWCKM5lFe~oS`W_!nL@yXfqN#_mdU5_OUKC!yUVYQMr|d+;KwGIfOYohN{}q& zoB;{&2GjS5Hc~Mm3Hj8Ue-WHh>dIgt|NDk;3RCa!KlkEBnHE17IOl($S~L7kY-_3D z`~66cb8T0{M3^dIQ&)uZIs zQ>U1~74$Xlx%sbp#OLx+lEduMVcQ&-mtIqM}U;g;;hI;Ao z(Ht9MQ)&W2Zvb-z-Y~ZDjHB;vQtpH3U`K-5ls|GpdGu^h)3=nZrtYYQF`EM#g}Wqf z>|_W~8-XqYucQvswis>*xXFBXc>ye+ak;+E976MwF0%-{cxEo&51ztmVBMre1_l+j z0k_0PA>CxfY$}UubqE-5+MujAD90rloRqDNCkkK9!A81g8~~9>LhuM8hwf8Oo@i2K z+ZH6b)`S|G>uNPv?{x-6&jiL^hpXg;LT#`F<9W2y2 z6Jke!ZdCT>EVkBjWaJ5g=SIU(5)qbxM#^>#0~=14OLM^x4(mP%j2!@%dJia;{eK405SzbZd@KYbx*IrVM;**2$UB~b^RICO`zd3MN;Cz0f z$aL?=7b;Mss2nFK9lSGS3ee8H^EyP#>6 zqQuHq%#;;_+xf{mvbZrCnp}IclZY5>Rpj%1WjXQsMJ|I3h>0(o#fNO;2HBU1WSfuv zV>AA>G4nYml1G;xFEyvRCovvYBLYiZI$Kxl0&QLswTUQG`Gk~|t&%JgIbN>XL)`F8 zoL_9?a({U8@UTbRfCU5m822V$jV}Bf6cR|*`8cH?odYQQjyb=!nfR%me&)ZScIJ1*kf&;8tyG_GF>#DG@q*yP-Z(5!_eXIk+!1Fxglup zjB#vR_~6na8{a@u2M=t$bwxG1Q-HTof2qkwv)ai8uJ2q&&4a3@}!m zq&fPjTO57|tA)()p;rYg7$49OA`Ah+go~mK?ehjdwi)FTPTyYCTBYo#U63NB8nN0S z*ajP}U5nlWXQY$2(>tDD*WyZlO#(kEuf@JEvBPF>&@A~)PA|SeB40ERCNSY!NtfK! 
z_N5gL|9gRv%R{5a!{;hVId79KmS_DX2{O)~t7Sx6*o-LKs2UGg1 z50=;R)%4$0=09a}FoeELHbeTFxnw0hp{&-&X@@_~YXEKT((b1|WXs4ut5mme4*|pg zNpq)-`=OO-S%W3vfYIlocwL7JU8U?f^Zo!hyb>NF8o*Gv*@jWT5Wt1BKuZ!C#hn^9 z+fT+NGP2>6LndBZ*Q-OEg_{ChcBG>@5UH)jT92Nb@*FFC70vWra|Btq z0l|pAt#x;{mW;x4udbe|vvMNflwEDWiOz5>uRjeJ46i8`R%fowTk6oK{{QFdkgkzG z`qG5`2pXY&aH0;Ws>4;94oeOpaQa^S)QN=Huyq?}0p{M*kwM5hkZijfj+VAm)_n|b zaG0=CeL|}|`Oq$K7v$ol33QqF>6<|GpBE1~JrGc;wbTlR-=%zxkY*EQm23EV*-!dF zwG)mV?tg;ZHbz`NVEKGbMF5;pFlpCkrT?x%|J7&t`x|8r@RVo?>*yW}@VnT(XP-z< z-Jv&o%2noS*HkBVd;Ag_N8T7C@4RL3x+O?Ra#Vv)dY}*U-^U&&MK`I-<{IfI;w6lc zhPoO)Z-}|zoHm%4c!r@9F@0JvnVQ1(!4-<`J0x50B2sc-hXv`>%+CH}plTF)6qUa8 zS}P*;_5bl+A+cajoArbf$}hR^iyqk(*YLv8>Q?5!OL$72mHT~ekmi}e(%QD(uR2*l z!-w1XZ9)ZN^C-6MtTSu@Xh2@P6W`nTk%n~A4kZG$C@NV1%+GA?8gfkl&D)D`lGbqe z_Q7lZoV>A0FTBB$^)>d9Nl-$&uo8f3{`cS|j#im|_y1n}^!`6#QNZl*SGW5mkPL+F zfKc__rYjA({fGv80*r`hW)Zi0vh+M9v8KI=!H#K07FQuC#l0m`QvmJ^gY(wZX{!83rRTK@yUCZP1)&Qc>`pPW6^D=Xnd z`YJd96cGFm3!fBY18ZdMhpxvTcI;XFoXRKr+2nrCPRv=-c^ zB(;`-N^JPEPS(-%nkjIDk`5kZ2wavXl-763ZX5I8{~VPr_{$tz<=sRwx-Mk-Jg@X( zcEJ*0(iH)j&0{R~3qZ!XI1EQuS62!6S4^e}%gn>aAJVHps8U14{Muf4eNhYOIcHCf9#E(;XJD4|sn@Egso!X~$M)dNVBM^P8X z&6l+A0upKo*Q^SYiEG0V#5+=lug*)=i{4|}KlUKeCe2AyYks&%bSZ>nf^O*SoNp_! z`Q?9a<_xKzULQ{gQjY}P3sHuuvSRe@h7Hkrb{hA7Bz1? 
ziBEBf2t9pE_~i0PS#KE@)&_e)A6rV>Ng$DtL6M0B`#yCzkRf;%(5#k#Xb-4r@a(7u<&VY=;bMY{d@O%V09wNifgGNK#0$$ z!6aln`1>Fh-?czjC9%_)2$Ce}pn+ZF1RD)q8^FmHsZvjN>|~sEB8W#>@C6cjQZ?+DwJs8=iP{~@qTQ>xQH~<| z_LWn}Mts%&@3fl)_g{1mj&Lh9xouH$fu{=84EUUlzzf8)fh@}Q`NiM#a|sO}K+ybH zR#iC-oQLKbkPtx!p3;r@LlDtZKD*VPU@A2xT1H4Vg~%x<6Nt&tyoK<_hH;!QS+m4ukJiww zVYKUWP*A51c5XUml-LjdYRt5nA$+WklvzWEYRrS-T9&^z{fRmYmRXuZcer{&kN4Hi7W=z-C%w8lRHI)wqz ze;NwI!C(7<{_pmS@Wle0Aiu05+H3rbaXf-SJ5SYdw!ETKsTh#OrTr6aCn0z@w%_P+ z62S>R3uj>Hff9y_+KN&=;5lx8(UJVo^U9H&?$cy=zQgyvnL6m_Q`-P_?<_0drTtA# zuIOW7kqM)2^xJ?lNgt{$HLrf}@T5FAspt0J(J-nR@=U(Bz381u_^f-^tCwq~H_E+z zL2ChAy#vtrN`=EPEL<$+ywb1Glo0ljM5r2YZ;@84$T%XwiD2t47{5WAmkP^bSoNQP z>>wo+3nHSLGPl(yb}2n@D2qVou}+5Y{iH# z61V4!fP9j`jio8Eon#BXt7Qbs3vrsvAfG}Y9fz8?c@!lG9vot8WL$@vl-FK={SWTK zKg*3Xmjux8uk9X}1E7)Fvdae@r6P`eE_&tLRY&U>bwJcN=G)+$U16E}1$@mX#WU*# zZx@om$OVR*KZFd1V?kTXU#MCyKwIzqCJF{UL*9TSlArq~*pkwe*n!^znlx0VXbw_& z37AurF~0|t{vj1cE^!{QY4=%=hKWyU&wohPh6vPN^8ICKlgQ-f3ZQT4Ny$U&aA-rH zb|nHEa%mDv;WRpI#lX@avU;dACeMAHTRPWscMv2Mv@DrYl9SL)?=BcufD0B3C{^>` z_VAo4WaVO0LlB%jRz_8_8R&=};Y5vyb@7gd58;#}p^}O6epQwM1qVO)b6q>#fn5q} zyo`!cW+uyb{3y^m(Ct6EgMYY2W(**8)hO>aDQQ5;H>F?NETwCWYBUc??oAJHiIFiD z^Mv`*$H*{Q@2%r5Mk(ThbK7Q1jy%!x^3FbatHIQhYQVR z;krpgCSN|WSL+pa4*B^{dr2U9gv~W0i@zq-$Cre|S)F#v@M$jG2jFKca@0*`^KK{0 zfctVGS_`1mG1znG$4r|LJ}N0B;X3^HZj~h?AS>{H2i_<`If2BRh!oK0%VBN!u=0Y; zJjKq%fWBU_wn)neIASb0SHY$p{2#%J4pc_FHUjnE`A7>1mTwK(gSH4xCzGi{nTIzU zYB4P*?Z&Ka&$DaG3hXjil7gls{lI55@*Ow7N-OvKS$5x#cgpcNuKbK5#4Vbzvz|Y66m&9zkM!%1Ungf z!&z6jlh#)vh`h>AMi$SoTz909=a#mg-G>EB;YL#;v%G;GD2EO@KjY;^x4e^fF$c1Z zps0*r)}%C?uS1Q2*@PBe#g($GmXc-J1*n-RB0jHM;05i%jTF`Rq z=$)9PX2OF4Z{(gY_en=@$rPJPVN8KE|H2*?8W=?OO&q<; zxi$@|FGmVt+(ARsPI=TWp++a%D9I974=#%%XUv(WB)ZvU+NZE5{F$1s;|{I%^|KAS z8`C*yc+pNga+A^%qseun2 zXk69!w~h0(x*s*1x_Rec9sLZ;&*A>VH2RwZ2foq^kdQnTz0EsIThp>HVpoBmmXl$= z-MRV2nyu*#whI9}n*i*Pc1n4Cbd?nBCtwp^PXRisHDmo2N^46)y95y2(lbI%8eRn3 zX!w6Yqa-xJ5!4gX#;V-MMM@T^BUatbTmO@_{3)I4NXKm+dlK3qp@D$Dpy>AJAO3f- 
zbCb%Imh<0k*?wst++r04^D_VUNq_$hcLI1Sl%qn#p}c`0a)I;4|E5iUUS|zX+CopS z*MAcREKggRK9>Kfp#OZrmI07y;TI=6dOL3~b?P3J|9?OhB;>vO##qME_Xfh3Q( z*nt}2ukJAz2gwwP6R|7Lq3n~G;?~Z#MkGsv6`iAc`kRMv5d#{BEnpO;!yDvYZ zrl$7np*6Yx=RR^0p=_j9HE6mfU$X%QY~t-&VBGP$8`hwPBX#=?K}&TBSP2w?*_|hs zA5I#qg3898Fj5AjP|a+X55C^Vz_n$%Me#O3A(IVcjgy0}MYl;mzV0}h7AwN>o!fj*BMsx#vump0Hg$nR_BXQpXuJB@5ZGBMjh!^P`>#*rKu!N0y|V{8 zYs0Civ8NIvj!HjgFM*Km3W)GTQ2Jzp$?TrPyY$9~U8Ths(W4iq8t;?9R_kzjW&%y_m3yyrob`C-j{%=n3VGke~35zy5CG6mLbFiwzw~; zBp6Sai9ZJxYU3?#N9|u@Kz??e0frGprmAPyT+eVOC(*=<>T}KfN`SMdK*cbh|`!9(&M9B z{FXttHdOR6!^_y1+jqRZEA~CXfT!8arUS39eC}XSb-yS-2!%(>+0V~0LS#U4*?D8Y z*b+73)a;icok2DkG77jO`7R!)xt}))@9w@Bxbe>>uAFBV~UwJZm^?* z1H(!#A#9C5UsVQMdTUrY5NjR;L4CoKP$=?%k`B zTr&@FcsC-m@l-Yqn9C(^2mA_~4x|5q50!aHAp)_GF<;O?Fm{~%BGY{d*cBCnlwcTy zypApbRaLz9jH%e0j0HmOTyq^BMDwi<7`@ zp$J?RoZu)=+i-Fekk8aoqwLC?#cuNk0>Oxz;dq8A?-$Hj{|yLzq0_XZZeo=qW&M64 zl@tj|B@)f-4fa7Pj=l5)xU@3`ZUfD!Bp=K(IwT~7<<_5w@9(WK0UEY0`_K>X-8z*ZX8{E68a>f??1-qDLxldZ!qtbn$%$yvgf3=AT4R|H;BZp$(1LQXkU4PzT8 zBzOaM0ob}2Xt7p}GYhiSX@Nc}unxx}KiqlSqh-VJs*Oxv5?5PhlHU?Y0LX-!7A3^S z{cLmXPaxdTkcb<>qW&V7#igVyDa$E8-J^^L_D@&7fB)SUkMaTNS-&51O?>OM|XNLi8dSJPM9#lN(-;;$Ca<~_9Nd4HQQ z+tU^}GYG8AhV4s6XN_lT8E`Kt9=!b*8*?Us%;m|H*q})Cu4J z-9(u<_%$%OS4E3wLkYyf*DW5fvfi<9w79{(E|7Zw>KuaaC*V6I)D!c-t?Ds2osdYt zBK!8D^}u?QebelY8Be#FrA@Q*58Q%__vFqWb^bv!-sZAGrz_QM^yon@(uL4NndM9 zc-ldMRn(=B(o4YQFI7CmxXQaMRHH^y)c0WL7+PoS0+zhidj|lGyUqvgO&<=wOqPURY4!j!Xp<*as~feq-^n?>eaQOD*!VP zwS-5*cvKOXb86z;HwaGyO`M15-JA@j8fI_AeVzpV8*M_^cFC&5K@WHulc`5-qkGhF z(C5&*jiRMVPBPbY8n1*WymBFTV(va4kMQG(BafW6 zIP-a8cuKp4GS4Nv>v2!{tbvAQzD&pd3DI?$CXd-yB|}uGsun17s6KMe9Q}2dD|P5Ip2}EF8;5ff&Bh}{8F|pKo_(V z(E=*|PW22YkTA;5= zT?n?nvkNW3row}E!5d&nsj3TmRC|c63eslG!|mqEV078^)t;}6I_{G3)e7K48aDG!^s~|Z%7EGAWq3Z z4LvO%D4z03r|$3ueFlrn8Y z6s#cz{oNpq-Y%P%e4wiCUa(#^B{c$F^QpH9(M;P(C7zY*9pPLvncU|L={h43m3=Nl z&|9d8O^;~!&uP+<4na#@)Oxvly;`?a!hEcg=KgGFJG;oN8Ruk%jD@f@+eq_S3#!ER z0?frtv#%N1CfHx2!xQhe1i^IFVx!5{8x;r7X4yyD=v@+UO741CXEy$B?@-jE$wfK* 
zSa2DgOBxC5FG}B~g>jDEkqDF8;F10|WDwn4IUr4}nAQt(mcI_h;BCixagIwq&>9mh zq_&aRRzJ_URvGf7q2b-%rJ>_Ra11|JgM@DiVvYaSIE z)jHB$)=Meiz1OPyJ zh;;J(DG?nQs5@((K_J%RtWz$DM;WIj0<)p3+b+L1L#a$g?6}{!D3!S2wMqg4s0>Vl zs!`df-rIrH5=P!z;1F(;R$l&tI@Q3;ObGQvW*DG<>o><9_nF>;+*rh%?(3>vggsqoK{)Nz7S2Tivl20Jzre{wa5 zp0~ERy{dd|1g>%2$qE>TvLofErhH5H`0PXhapcD%!BJHf2%nDOB} zxc-Su87;THnuy;TJ)blZr#fq$fWZr{u-mLh%U#0HCg6t^W3fg)Y$3APSj#Lk$U6+%;tfk?v%@grID1)#e$x6t*ouz3uHI1$(DiDbThsw>keYW+~ zp4>b)VAow=g}$4AHfA$q$4^?zUfFDxCE-B=ynuLS`)s>9{ewjPYr@;J zi%a}7%Yswip-KmjU-4}VmnJ9n!u?tf$+s`Imy|KRL(U#RaqTLD$H9^`O=r8HfnQGc z(psNSnX;Q`sf1@r4=7DE%_#YR67RZrsj`RlQ9}xqCE4Mm%us}Gsub*l=(JYBpF=BjENe#fO+=Dw}+2yU?UcqFmiUmT0;ab>ab4GHR!)nU8+6(95GQQ(F25a zKYx{x?rk`@w+G^xlf4(fy$w5IAqtmluwOSz>9*3BKA(&+Vnj$NiJF%WfR^c=%T>9dNte6fy&m8F&cZ0eqy3l_Zk_b zMwm22Kx}w1i`)!cZa`9}iq$0D>z5LyT_n7cuZ|s z2cV&m`^i(1DvnXp)|t;B+EA2dz%&`QN;6884of zo!echryK2Hqx*uOV>&CL@4dB~lK)Uia?JA(*}eG>>6{Y}Nw`n}^beiwiYZ*(frwX$ zYL<28Iqk%vXZ?3VPXlui7jNexSazZjPv@F+rntp5-m{%tYk|9`zCHBe(JrN$v&->tGzKwo14{A003Jh_Izv z`=lAoKIm^t$);vN|tMo{9L(0EetA za9h)_c``?*X0MGDYvV~^j{*X)k=Z-5;5d7r6>8j>6jx~BeFovn*QXC~5}{{T%wdmJ z4YcBwM4U`Wnatq~AX`1(okD#s_GX}2T%YLqH8xregLnbMsjtnimO~m2Qa=Q12*C0j zZ*hL~e5EN0E1)k+R2Kcx@SuaJ5ma^(E|7AfFx-7goR2Ei#{n9gu=l$XQ?f|M zwIf7&@$~7NE8Dyg8y#(IHVn&>rLsD#cxt$Va883u%Qb52%wPpNedAn>0H-XmvX5HM zSVO5+RWAWlAUXVYgB)9S9;c#;&Y9@L-?0}S5=WV1ZnxUd>m;%ABZYParrul3=gOz^ zSHbDqLNumSqq|P?3(0L#Q4Zzf69RqE{N?!P$i8rT2WZYo6Ul z_0OJ0TTzU}(q9Pp++J^|Y_0rgwkg4ZKD5lHW;1*;us>6oLace#fWYKo7GWMNNwq0T zT+z#6;A#1Pc>Bt*D7P+bMK}@$jf#XIpp;FB2qKD70y@+X(n>c1l7fIJDoBT<_07Z$%n=sR2H+6!r4UgHQcJ5OSM?*^^l8v8xr zm3OJdpevT{Y&OsAqL_E;O^4mf1Irikdj~>G?F!an2GiPW){m?$6Fs9iMBaOxEB4_Z z?2)F^wlnHKE3wbui1gpQa8tj+$>iPQ7BdpAVcP_@IpzIRo!o!ginIBtEMyKCIQ!Oy= zm|q&FVLbhPr<$Rr`Ov(P%XOuv=N2N}3K(@pdSI#fI}+opXP(0o6x=rNc`U12xyI z*T#=-sg*k9g}*k>aNXx;aqtpZ?RO@wsu0JAVllkaV{U=~w4TMe9Lzb4H}e}Ud$weM z{+`n@jRLPP`>%;zGtqj_Fn;=&8DwR=;!h8*osJUxL`Q8!M`#f%MZOr?!5q(;N##`O zEIjZm;k8$jA&TDW%=nEE8}oRp-8Y|aj}$Uo##!xpZ8D5k3vHx-SZY@B+ON&O_TXo= 
zoEdWA6-Hu&vi4Djelg6#kk#)M81y3>`$kP;Jo%|f%GVyzyty}m?%JE!(5S#GMzRT%W8ineK$g4ifBCT zy3)0WVy7ppDyw)R6f_tHzAW0A5?k2hsHB-ikhpO4M}$QE8;%yZtuZ} z#%WpnS#OH!mz7mEiisB!zi2rk{u`^LH{@n}+v$xj zMl5xP@ydA;he%&O5^-OHG&dezlVjU#T3 zc=p+IiTp$9OQUKH%Ys673s&N`YGxMI(b#b}As1Hv!&$*J?}?krJh$!0M<`k?qN#Ro z8?*RXHSrYG^)mUaQ>3C)1YbPs^JPVb!TnM!Uu@a@0fG#MSvrp;hBz_((2?V%2efKA zHO|7{$c5Fg9JQc(du;TOOzL>exYJA&bch?^W#6Q6dapqPM=(VX5 z%;Zu`n;aqW5xX(V2=(gzVGbcrTprLau3B6Z;+>jRt6Ia&rWsZ-w4}%@j#8Yf-MLG= zfz;dI7P>k;C;#kYsC7V6dz^f@m&?;u-a9YtexhF^i$RCw$sd12xdcG77(4Myq#8@B z()Bav{5#4(A?m73UX2GH)KxF+>ZZ5rjr&x{lbP9_#2A;?tJ+%hY2{DK080VCp+f;e zQ#56bUERwc5+x?d4CTAP4l!rd+X2%|tQm~7z`C75`a(z#3VWynCtVCz^lmTy=Rv=p z&zr*w#6F8mtBh93<^PBSw};4Qac8H{A&d3;A9??uF#>PQE;GU%Q|L-4eKG}9E{EAT!G9T$h)11Z7kBR{>Lta|^&#PcZt|C4asHQW*e<6E+zL1xF z2$dy1Am*jQwB?x&_90lsH}U^mzyeLNV|Q7%LsS==ar?!*axp@J)EJ=Js<(yM9L>3t z@p}Kbo_qial{ZQVo<_4iS&&|hpFC!;qVUHC^`EF__9BEjm_Ag~bD+xG%=mQS2GT3e z9wn%o{JW35O1+;FKw&*`WvZjLY->>pj_gYjI91Ti zt(#Z&Sa$U~yt_HPtELEcala!Qx9`k!Nh1+NjIimd%%m6Z`@--getw|&pYi$`gNqDq zaBjtsoh$h=atR8r27q~Zf!$m)gu%P{jT3$h?jo}s_o9r!C;-a=-MK)0erax;(svq} zehO#~wi7>eznV`$+phXZdrScsQnWNV3YF8D zkvrGd$EZ?KsX;c-SbNnDZw#e=p^>NQH>{fgUQL6sz|X_SW`mLx8mOWsU;&{BD4yDy z63tHxHy7c#U|5;LoS|{{uz~tNC_@Cd_gOeXHJ!Jd{!~6tAZ4wu14EJ@%*3Jvw|6%TS`hn|kTtSvIpw(3qqR&Div*uoKkXSAJsj0IB_rwn{Aa$9_oMp z<2Fk!dfFLivKRnQWg}`(uZq`!jhau4V9Z41Mwr9(#RAJrmhQ z0-?A43f1T15=%gBk9Y$lm-q%u{?Ig`zGd2LA-q79iEJ^_`3e;+a~l@Zj&Z0(7o2$m zJuw0l_CFTB+JZ?z(N$yTL#G*;JanMPSE3}JrUG_Lbhie2^lmk*&u6>2=_@ty^U!$#~6Dg`$V6Mes zO?B#7PUKZ&F-697{L#oT1v^SNn&o${dHVE!^ay?*8@GMr6*8~9eKTJbA3OB~5LyJS zc3f)ak96%%L2=Ctl@|L_e{7yLJB1wu1}zq_ph3a(V(Y1qYJ>C}H=5Xs#t;rUNKbL_ zj@D_EJC_0#$1 ziV#A(^FDh38pbDM+{;H}Mv?uF zxtt1~V)KROZ}L(ExQPZ>12sc zN$NV-2~3d_tCc9LL(YCy*>9HL8uWSzJoHbtfm&EY*?PZPz^mgFiA+ESV7-{#q4W}x z`f9&Si!r@mb$x&Ylc{1$x6CYapaposV-w-}mik$RNgIn#gQet2d7c(?wLYl1$N1hV z9P^hlhThrkVmmWHP4MO;vbaZ76yZ;()q0VfsBwl})zfRA_sDU?fv3RXTE0NxP|0(v zr;B6tv`kF%vZGr2j?x-}A{X0zhTpTAHy2=X9Sr{0Jqr|AT*$Ftk=0|7B6llaAE%#C 
z;MFWEDJHEg-Iz|5jaSb%jkKBT>kCv5 z8~yNy8!rZt$~B9i*p>C6Y8U1*Dl~*kk9dKE0DBmbeAvvF^XC-ey{S@NHs|>54jq|8 zusUx_%SMXW`Xd|@^cLaL7UB+s&KpuMNqg)8GgjBI17=9%7a8%pw~0&zitHbX(oeFk zk3_Vtc>AscheNHKeClKok?QA1rw7|z+OrS92kO_O2Ky~~S-CH{Oh*9VQ4DG%ru9NOt zAe~zaY>3fnjdVmH^{QJ4)*1@9H7;M;HNbw(ceCQG>=FJImUg97#pXOSXj9?Z_olmJfrcuo~-97^?x;_F!u=de0HQ0C4jCM;{U z;YB|*>QNxjU^bcsYM2A<~e9 z5FXVaysQrY9`X-6vnL4|BqaBSE~YuP7Vk3RV(*Uv@fMB-iglRrlj7rXG}+0?Wc*h~ z(t|(FMOJ+Z?7SI&5L`RmlVtFG?m}X#z>SD};#YOz<5fPsZl6(&ULa&z$+EJH7TuOJ zcy>oy!5ND7xuddz->4C0P=AxZ%mpX1i7(o<-!-5iVxnx7rb;kq2P~+o3(JvK?%iB^ zil}R5;>Lcr+Falf8zInYGxe<2&-pbl)_rEYN2mI!2Iu8gG>`;$3%Z>8T@nKi(61D|~FV$W7q=!2`yINOvCeTpu4m_~5>`-#%g>u1vZ4 z|8y~bd+ye`R)nVx=lBc@8{9_=8`6wY?pk?>Xs^Y}NSTb-b<9PP9n@3ys_+Hy49Z)j zKt%fUM6&k_7^r7zi)ut>A$q*Y-8AxSq8?_~Tnb@|(HdOsvEmJ_t7x9&L*D_u9g+VDAI zP}ga6wxy1O@uNWJGC7hI{ ze;r;>Ke_8VfTy20eBOCv=u?eM{he9;`;ywyXI`AIr^Fl)AVr=Dj(-iuKVJ|=h~AcG zhdIrA|9CtdLmc3G^hPJ@OqS5}b2Wy&`O{xhu-^G5=fa2_%#y#78M@*_nVF0J=ajJG|Qe%c;Te{0Dq6rp)d@)_!+odimT>^ z*nj_BA-Co5d+i#8<_(>Suj99!p`SwSr4NkobNJkWwhjdS`L{nX!mln)scfpIqQ5p> zcB8lG?+;O+V0WM*a))0Qm3zSOZ@&ruf1HvzfL=cQ4AsejsGM*Y*5^h#lBE^|h_rC@ zos@qMVkowe-7^H&I8WD|W!V!=M;%Obt}fdoOV-Qtsr{`&SmU+mnh;BTm2d;Q}4 z=Ww<+5Z?#=KTQ9no%Z+0{{1z?cfNeWJQIypEWZ>p!W23!)5Z3$!_D&M)c)+Edi?4O zxS-o4myru%PuckAb^P@|7f*jNc~H-k2zwkZQ{fij|I6scQ7-VMq#X->WOdF(BNwJa{jM%`f?vVs;qEY9pXT`uxN#kL;$XBzeEb@o?6vk7xw!LF{C#-$!Esle}oBq zYd_MY`{fQQ=3k>2S%;!Y$N*lkQ2%%4HS(#lit_y5cM!u4chFgFBmA#hGZGMPt_T2U9-s0TnfI^w{rd~%PPhZF zu!8izG(G?83MDs(-R^+j1DgM_+rb@pguxxiQT43snVUUd63D_-RlH3R``>pUhuneT zNKw@Po~jCDsst2JdvdHj_d#w5cW_fit>=G)9*i6D|L|(3Y7M^B!*b_zQCx(dKG5(K`aHhI5nE7$Qg`S_$LUx+TCB&Wp%VD^*^yP*S zh?Zz=Cx@2c8BPYgS!m7hBtyqY1~zGHH<8k0hcpBtl3Z(vAe3H+t&VcS(qPa%BYoJF z;&WHeyTCpeS^nvM>0jW(-Yv&1njCliu|LB==11G+(9!AFccpVfF6wcAuzt+(=zcb3 z#wA1Lu;|E-Ok~@T*qjNXGEIdl*U~-7OxA)mRll#!u|4rxc|=rZ z0ny+3;L1OvU!2PggXRHQIH|$H4h6f}J8|UqW5k||KV34dFJB?Ou)nNk&1`7q0u9#( z28GGt;nE%jrHmZ|s)A<+5os-7=v3DZzbt53mH&u=bpduY5Vx+oN;ieBQ35$y 
z`tpc6Lvwf}!2crKsL4>!P9lR<)2>LOTnuZ=>$6Ju_byBV4=*H07HaB9z7ybdI#*@8 zR_Q6b5OaFS!mCEZ(nrlteya@CbBno{(A=gr>S;EcnzOy_z2=Tl`^o%KOg=YAX7PM= zJGhw3Q}kpKsaP`@ssA%|WM-le-N)G#=T$?^r5}FqaJItTyAm{oOW8BT!v!aGC@Hxu zww~l%)LYCJEQ0Jtl#s%F*rqv5MQwY7qSNrwxrWo}_EwPmT(K-P0-fA}?;XAWT#xd1 z5D7r6*=H86AuNV%*e00GdP-Z|)5Gej$DM(jWrkhcUw1aGEQ?xvL}B}4HE-+G{l}i^ zR>CIwlq6gXA9A^#a&zry5ixbrkg3ZIPycq8lHwE?t)UX8VHMX+5Q?3$WxD(RBZP?~&siMveVwOUUCDa$ihJaxx7^8O2|@$%=Y(X$CQ z1WCRYL5`QIw#y&h48H2PV6mv{3xc@XeVFI4R>m0In!!UsTiU(Q;hU&oity@U<7%~0 zMsFcVH;US z_m!8a;Cg>W-pmM9ev;C?q6Rujh@KrJM$@d6;{cQ*Ia9(Q7>C*+v_vR!0bcV=^$Bn;=Y(00o4`?I67pvjjbsn>uj;z6hBjDF(Fm7)`35B|<|$ji#s z4ENCIe&YDo$;tK@1X6pKYs2%;)$)dlSn|1vfnxKH=78U z**%|%+T4JP+ocq+u2b0E>L@H&%TRla{_SwKHl4a|?U57jiktAvzJo@5#+m3(etD&v ztGy_h@V)>jQ1XC8FTM<4gHfm zNmj&g<0Gux-LHfk$Cj_nK2GrW=icc?H#fW^k=#Tj!>%+$f!nO1Lkv*3hKlk6@jb1i z)Mx#dK7oD7%)ujvUgS7}HI(gq?&H;s#P@v+N}48JT#C6nqqXSpjg`4{ql`SuPg-D*UGpN)U2Y@#%e& z?7yb}vi}ipx(PbRu*|!6WYFpN_p_h9S4r^!DtoJ@?NfH;;_wsqy_q?75y2=L18fn)QhiYVyd32 z_{c_bm)&FHl1q|npjK znW3!3&(5autV1@Z+!mkP%jBfXbWUaDNtDfq&+5b0C|iX<&ccyK$)FLKk^z*`w-m<6 zlY+>G(4w52@XrlF@HBbbp?>BUgCAQm*~)Ee(Ps6%kyUGNWvSdUIm~}#&N)voNHGew zOTucvgQWP3$&%}S-W(xvZT?}4I1mck&9Y-^lSdiR*J?7q5hWn`?Ik%4d5t@b{Kw}s z+^!mpsAU%AGqiI1$&~liyig`5i^kF1>Xf3mm9hOol0xn}MOHL9F}a~DA?4zrffob& zg*SIE-qrWtRq+kf9e?NKY;aNK8raeHr*ij1Zw?OG zToV-)tVF3pxAuIg`pW9(gN%>z;^Iu|3M@WkZq@_t`kaI$;c65g`z-Gjt?D_~Ora6p z;|Fg4=aP_DXVOK_IO4`V2k!~%T}YVnM1MKm5a)$CT0AObo9m%b+T+WpwP0<4Kt$$Lycws+}%V&5ngyRw*C z#NcpFVx&W-ydsFWmpG+(-k+yn<+hqpbhNG5Csc4ZfRd@z><&)du+qDKzm{FK`o-T~ z?js;(cKz!wIW;%A)yx$wyTx~BY4fZfTX-o5aDw zEP^gLVpVLhCYt#z*@|hI(-s!A3G}L6sc6iBBbh%mO1VK2O#8LqN_HdnYQd-^jcqF3 zS~EQ4VAn{e0~GB@7L?b&(Kl|v%Aa9$>f|%fEoi_~iJjknUdt})oL#k9G;KJohxG);Gn);!xsK*9S4_;Z+|6~)GZ3xzsbiBuq3&~08# zoNBZ1vF3VzZkZELk^K^-*I<9XS-#JE7gIBrox(x$k3^y~#o7acoWD$gtEeEU zN3Itpl;?3*Ls(OY7N41hPiriT15Q67;hQYZcE~>H{1bDZ+Lq_U%yKq_3K=EsBlkoG zwN-Py3j$;vr`S?0eta#oqRxCkvcD3~X%Q-T_m{O1ZAkLY<|x{8uztQoN1jV5=ol8> zoYFSn33#AfAC5D-7$anqZK80`x=Z9RJo3@rJy+`Dit 
zE|qxiVoNqAnC|*H9OYlD_vfOCenDE0x!EGblk(2_v^Xc#rnG5v8rf7Tee__YLpbfM(8t6NJ!0@g`2AaX^N zq1JUxuM(ePKPYJQ>y^1F5OWJ{8vBaOaDzt(mfA5nL}p40eF1-x#_bMm-NraQ+IG-} zHZWweFLcsK;M{Rv+E7)#$9JPH*2JFRKQi(4B9@$#+}4XcK>gIOQTgDo8yiMdSYA2|`1Ib_T4{6;yB7K?A(j`c80Qkp9#qecZ;```lAIn>;pc z%tjXtWbd5CsXQTrcVAo4vi|#K0iUFGCOT8-!!6&s@JyboeF_q&h<2%F>Tf z5`5)-v$DMw#P_s7k6Z1SVvOCxx;ZoP4}QmWKG&YSS&lb%k@tM^)MN0}Z+don-Z)T` zISOyDz88v)avWXuN*x^CCNqhRd?tq^-iZkv8Y$p+-rWJef=SAy_5! zSyV)-XZ?rJOFZK^9rbe4pcFPyoO)h5zk%7Nk^qIo@ zCzYPJr?nd0;b-3|RuB#67@)`PhC}rXCh(@4tC7RuD%}l%>qqaGI^bKoQbLKGHLD+z z&J~wB73%>3EGR%nr7^hJ=@S%LYEHV8)a06~_jTxO5>qqJ`c?RTdIRUvzpn~TMP{SV zENI)Q1PXo?bf<=+%82k9IRR?6(JSufcJ%J;J}tZBAG6VW;;Vt@%}(ci`+&S;iL~ot zOtQZ7TH%F{-fnukfQ5u=<(H3NFN=RdZi3lW^bQem@y&+%mcKtw;VM#Iit2yRD*@d% zmo@bsRz;^rR~{z>o$&El-{>iF&b`6idn0|~wY+)oiD!P_9C6bIE15ErxX;o~rfdD@ zyyY`Vf8XW)ZTs|iR`L9>{@JrZJ72xVxsqu3QuB>F%QnaT+SanV*2;o0j2&w`S?)75 zYXg^K!r3iTQH_R|O=rc{LO#eeU%X_s_8_14hMk3s3KKG5%p-)v#jJS$3lH}UL?-pZ zM@wt#`sWk*YnmVyeq51mAWr1D*sMQehHjiL*Y7`GsR+++<8y}(-27$u|287T$WJJr z?f&;4K*Q#nS$g@O{WI=LRmuk8kn^J9c8Xl6*gB1J#V_Kugg%HFNt zEX&OCH5|YRtHoL9EDcLsF??s%mG-T1X=4*cwE z*mVy4I(;ydNzw$Q1z@139}c=pwBaxuc= zFfCnCEP3{kEWn1~CO#zYH8!orS3QXVtR2{_XO$<<r$6T{Xv#AMY--GN8Yx>&+H) z{!J<_$0s5Vkt_hN1cM_m$czEkUj_M&K47gK7vzwukqGOH^D?4}jTXu?xaT{Mvj{$A zcT3^yvaHhxiEp0)lhg?zEQ?oVR--(G`^hXp>OGlI7iAr|!hP3njJQ zl8c5+k-PhjbnYxe;2Ykpq3=##l=5YuTb|mk)YPc6Zq6|^h@A8j{T0hOqOI76FLv_Q zHC>=vI}ed0MEH<0VVAQIeyE?ohzVxrzqpAwx|fivb$!V*EcBvzZe+X>CZJ0k^E#Pr!B$BfEVt~?`uZQ#9i|c#<8B}B zX%P>s1qGYT$ft|-wm|bhFO4@h&eD7!|AB~hqgpmVPg2pSH#;HkamOd#qDhv1=tJ1C zokhfAe+tup>Rdu%Vn6WP%$zosv%tG%09cZ={#hyQPQcJaXxHKVW>NG?U=K12#sIm{ zj@^6XAj6nnLM;>Umyw20byq|_HDBYS-k&x$&>BA=9*xMZBqb#c`k{LCz;fxQ((Wju zO{e@Rq4GD(sE?7{km~w>dgpX zVppuhhE<{uauu}%>_)d?FKS*@&;kk46$T4$KP<+ceF*(rzTy9q&jL_uJ-iFjO@ zAxa#lGB(r^LM|evS|EDe{4vnD5aO;aXsi=55I25-=)%HY(?isG^?BtQ5X+^p5~Y(Lmb?&ts;U~cGS}Zz24u6R2joK>a9Uoy(6eF!j6gVM+RXIk zB!UG;D^XsI8a3yzdDUU;lkeL1l`*S~po7Fau#M2b`d`B?*TD#qL_$;X0kfbmYL4n8 
zO-4-(Wa&uI$-SoMZvk$SK`|oyjmy-crKg`si2;={EkuVDHH!$^r2eLKEZuMw>}KeA zxkOC|3bM$eX=YQ4wm#9DS@#=$mf0tme9MQ?@p}!OdR~4IsLvI?uv{5b6goD@{RFs7 zb0D?SOjnS)@f3KztMe|q{iwE?A1B_MURHbhy(ayIM&Q_q;>xF(aL@}eC5Zpz#rsv02h4JSfSRLv=^0=>ar14A9)bp>|LinGj;~1q3%2(m9SR^2muuP|6 zz4dFZLCo)_*Jyk=cyVS)Wh-^v> zR`t=Dlig!JhHSBt6P0~d@0PD}B}IbhYbvkOYa`U{_duDnKb1u=0=`H+KMB&CU@QW< zMCseL2qBvOEyy$#NgARwQo~eZRxD~6CQ{l)M@I)+!=B#PqzU1Ar&!=zvHu_rXl_QJ z=$T%QbvMkT`n*QEf||l)qOV^Boj}LglBE_a9*x{b3ipT_PpknaZ4T(SHZjG1aoe0P z%YZk!r22018t7l$=U;^eio)*AlCku8#eRFGXAb0;av=65)PG~R+4ZH!qJmO&|H_P_josr7>)l}{ z7u*UY?BxmZTPEfg@(fkNgLvi!uqK$QtlH<7f5AoUfh6vL^9GE3$0(k$Cv zMJw539-_4HXA>hRv%j8+S9DE3znzxTV%gPCYDk9ePFJ6Cf3lLHNbWKoYJm9Os7mqvB|9eZF?t9j;Eni4goj75pKI1hc@`Tp7khom%!OB0VI zb&8RuAxa@LANvW94nFfl=@yoAj3xx07AhFo+@up~uT!369AgNMpxDNdZ1*B%L zRPcs6vF~B)n=84He6WzS27W_&BSo}yYM0ZM#xnI@!KYY0?i1yloitCmCcEo-0m!U=Uhli`ic6(d63+xHr3b%Qp?>OZ%aAoJYg> zI6A+`DYd0`qA?w2u@yg@@G3+Pnlj`cDY!qpO_p{Raf6?@;%#A<$X~e z>DchgB=aeF(IN&nyH9B4Ch>0`@brxfLhIEjId*)=9klM#=)#OK;?xlG)+$b%sM~b2u5ACbHDbmH% zZeTMmE3IJrFiG0#n0uFd6FCcM+8S-AJCk`Ul&%!7#??|gG^+U$7^~B->?#HpwrHB@ zJ*X)IMU%B0#-78(s9+moKDX&E<3-?<pM%bvE8x3_lArFTDE^;L<$~*>nDQeFl;Qm|13m_` zI&!=l0NA9bWw1Rb9NAK zUKdyZhC-nsFYF5DcGdHsQq#z9#zV&Pn|PzcOwKsNyhrkBU%7?W)TCzHYqVb!s%stW z&3qiN;xc_!N~N?fYHE0RRYtM_E%RP9|!3(x&HMKRpN#UlJ+a*#t@nByYuHTXy zpQR&}Lyvv656+In1$Pcn>`CCS5w)9^56048;Y8uoPYD|b1n}6@k@IVzrDwXo`k;@* zDXe63deZ1}J-ok?C8kA`pK?txBux~xe@1X3BEYQu7D^?xJeOVWfik9^;fjiGrf;8L z(ulHCVVC~R(k!y@2cPc^B<=K*deIQmb@|7^gjgvEy+-;PR;Cqq%}{EooDQJ;#5>#V z5gC@Lz#~Szm|CeCpkSy*nwobex@YX&MRc~Vj}b$xoN9n!3g;S-ux$euNaNP}jLzlX z%kz{MMO4=&$RhM+&;gSLr2Sozx1DP8(y{Wb<~%8_(H{UO_5YaCNn`O(eD&a7*`co z;O)va*6@m$V?1Kk&$z$0OWmZj%@{2Bym9@z2hTmJoX?=|x8!=)+J@p{zDdp|6m{62 zZY2M#pWBW2)NE{XJ~`C^hRogNo2G#~m!-Zvv7mFJ^%%TGpY_|XN&P+oPty3qCv-pG zy_-N7IM_*j;Zs$j=_75Y;pK`@ znm3lYoouC$N`cZfIyv2oiMDTecb3;%X_M;sEF`q}YpBl{NpsIR(#b2%o}JF0ty1C@ zMND~A?+F*uNN+L@IS<0t-|X3+{Xi6R_exio_^G0eIHW&fT~=PW;7UwVUtE%^T4`xs 
zl&Lf|b`l5gSu9lcUhUGPFVz&+_!zDmbm^PcmoP2I4U}K~c6hPn(UJsH@ASuKJgZHY zOkVe!b(YC6JZgX zltIVlE4K2*wh1XM9J9PDY`!53imir3 zIix{dEpeGIO!B(qk~SeFEQrhn5(3e1&10)r9nM^okpE2+qVRivasiGCcM_FB`g1Sl z-1pE+21U_I7!M$XLh1Y z_&9rZ=O}43&5>PDc?9~xxu5choVj^P^EO^b7}DUpnI3GbMKiwGahM=a_50=$O`qEV z#kzEJ^%<@TrM8Uo>QuTZlEn|D=um;FSA0w(GKi3xcdW|;i!AUZ(MN-~d$?}!WRGz;aoUy|?H zX$T4QK3DM}#d6`rJHBK$q80JN#&o>7O`0xxl~~FWEH(SR2!YReeR5qnoc=U^bouVe zRJS7c%T z?^?#w5`CRUr^{W|J@-J_k_gMI_9*q>kzA-2VRoRwaLt{p z$vxu71uizW{8krl&f*p4#0LC4oVTtrx|CdzzGJS_Ssi$Zmiv6O9yvXgj-AbnKHYAH zlGB3=eHiK!w&&ChldGNsq~WId8^9yxZv}7tQe=v5AbY2EwEss=>yf)S^&w&7OHjjE zq2dT?^e{O)mCQF4eAJ}-#d?r`S>;gqk1oz#e{Nn^0<)DyXE5YdyRw{4D^vXhlGh?y z##^2PH4HZkGBVmUx>i%R6lad3Qd3Wqrkv~V!Q>pyDAlL0eVspt#wriMfqfe=M1at4 zbm%sP=_LcW-X3YEDoyMVZ3a&2GIHx*?S^TtKHomUj}^)&sFW3)K9fqJ%UoU@P_LX@ z*Z%2sqJdqT(ltw^)3bg9MzwW&aw&J3GjWXA6zi9}wIba3%qn+=dfpzcc}A>ZT-HvY zQp4rhsN5$-&8pX1m%Glr8uceGmilg}0MS$gQ<69#m&iQXe4Vk@Xk7q9+rx_n%B9h zeef#}U~WS_UZ3}%8}AZCi5+mC?54Cy=K2C1-hCUY!oI9D)vVc`9Z*^fJu`+cadDh+ z>ZD1)Y9Az~W{=jGzWMx67WAtY-xcil{IYgYA;Bs;+jtah)?38!{&Hh|RuPanR*LLe z`Bn5RTAIBE>+gw}fOFK_*s_7OvSt6$m8nGAjvBiDu8Alk0Dxsq6{M7Mz@x)yv*TX( z_?w%vJ5js1qJm{P2{-7L-xH*f7NLnZ+wrv-Z3607D)R~<<`}X_j+Ia#sDR-3;@y$e-(wUBOACJpoKk$$PF ziaq7c21#aWVdHHNlNi#Ui08&g4R)&Ip43U(O;|ZpkR4w0mS^WMJ$aNTi?fz3H`m-3pYLZD7dhHJp2=!CSOz zez+WGB1IZCRs_BJ+moMZQyTDTv&Qt79{ay~vLrKO_0`k1iimE@gt8$aRk1gZ^|tbg zVb|=G^EP*(RuyG`Vrz(2c-dx)KiD-^Z$7N5Lk=4+XR6QaSs-G}6+^xZ>^5QG~f|kSHwI=Y(AyH7#xxm2W`Xkb{3vpImXvC9pPY@e7rLQ-8Bf~XLERX zXUnP>(uTQ8+qnVVcZND6!4VNBH!XLb&hQ()n;XoGAuFa29vSx}D>i7#jQ>6=W%BVz zB^oLRB^gKK_J9z?Z$*#?T&`ajPqkcK6M8DBm(>paDl z<0>bd6kw3xO$n2i)L8Ry&*VvR$|(vP-=wh(wmaHybN>VGLvId`{dA|;-Ki?|cBK>j zfx)7Zk}qDJ6l`ag4*BA4$^*~-*TU;#0k)m;5`jJEEAaw?p`qB-@3n)3fngEZnF*_R zIaHp-+{8GDiOG$wY1tvWaT8>pc`_dZ+kxU$cJG>&=3K!I`A*sC9*j{wa5{q{Z2N{) zBdGICK2EGS2LrywMqx`A%sbM}G3!?#^~rV5Rlnepqy5_wg7TX|OYYd?jzx>>o7;`6 zvjdeJ7WdiK@U$rV;4X!r?Q;6sz~+KR%O<^VCm(M%x3pjpEp4Q#wh2{AjLYs0(#EaY 
ztRSs_?D22kE@%wr6kpcLR}Fr;)^TqYwqzAa)i6ENLBrH8OTEFX9h>iia3RNp;ve4^ zN;E;wSZ{srK3{jf?98*0%y2&TTk^NImOKV2Tu20e%?N)c& z{Kd4-dpg^-8G{Qvs_**pOYEXEg&YU3N}UwUE;-u#y+H*4fryWSCx6YgMc2vR-8$MY z=P^jr3$Z*n)2-5Ol<`gdjk=6zG7a?^+OBYPKa3T2Q$wD;insAXaJ_5qV;tF}-9l7= z>De`l4HEkRmdmq>t@x${DISX-3zqq#uHU!-@huxVdHC1+j3)Sbn;N*m(qc9+nvcy% zsbj?Q&cfl`U&E!eEsHA&B^7YjH*rEZqXHe6fT3rQ*IiI&A?G8NL~j1wa`xe!9IRgN zIV(EV2bi$#t+oSV3Z9YK7nXH+(WCyu)YDHzH%T`KrAo(7_zKUS2k4(Kspqi(W~>r_ z#faNtvd#Pit0WaqSG zSG#O;vGj+;1M5sN@rah)La>l_4hsoIJ?0nDjT;gTk@Q%T12wdoR<VI8OB-lpHMFFuW28-TIX+CbT(Tq zHqN9T*38ok4UfUM;DmiywGvO6l?9ZQ|DX-XEB)onD@!J_`aJKI9=VwHMJ$9b~X7ZenD3)ZGwloPq<*23>kFux3I!31|+>UTOxI%jZdLllRWU0~^Qgut%@A^Mo8 z^V8`RXD6xYj>Ozz&b7G?*t1qTJ{0at;-(!Q*KHq}O5f-|M_M{BI2DvFpqso___%`| z4b9gk02HncySSu+Xif5(N}ye|;T%%!1V~2lOx0H}CQ~T(W1k(pjyrO7l6fry%#u){ z+_SC1K+2J#zlM8tiSxo|U|x{TlW+m?u9v~CfR9^dyotn7^EYJ&SR?K9*$4L|R-ByC znf+L^{IOFvS!9`=@M>t0#|ojIgKo?@icL9p%cbp1WIKB{w|zAa(`P`bzo20h5gkd< zDL)Cv;Z=aSlt&2yqk$~OzY>XbPtUC`DbU;+UWfZ*oEVXPv zAOr(m7at_&5YGb;t=EIM&Akcd%@da&Jp>PgyqfB2T-{sNIe2q!f9bBTeTX8u2~<_O z!Iun5T;b|qlKQ}vE}3qFhWaGef}#`^kA+FUDsXgecn+yNery`9?ljo-h$e03eNe@` z?^PkqE}3$CY&>TxY}bJks+R0~V)q;O7e^dB)c!lou^aGZaMJUBnh z9|(r2E7Isd(6NAX8@FmV03?|MU5s4hv9yvhWYsD>2NJ|N;JAR>fAFN7_7PMpm%tFf z4;T=TUiU+fMAGWqEEL|DjBB~T#l{?+o035b}DLIa^ih3QRghl4$p&& z{8+JH0U6N>&ucRhulWEw_n$B2*A~ABV0xzn6DFGqn;eEdHH9dnZ8vW(0|<-1ZlC-xn>>t|9FizuDD6`c4yUM1;P2P$3=}wt=7TiEQVl`gI?p zKFMTJP4R8@#Dli&wwp(LA-5hcdC)uvx*v4fsibHgX^^wf1<|Yg6F@?KB~@Ci_X9S6 z!PE{}A(LR8?&?6pXModlayCRd0(LADpl0ZZdLgpZbcuJ?AX#8P${`g-_0yq7)hw}d zPx+4#eKxIc!JhDx7D%b+BhaA#FkGCJ6uC>f#W*csY9~}S?YO3Fi*>wav~Jlb*=!OT zKQxdf#2zO-*_AqCE?`**gtA~E!Gq$zj&Tw(>Enody3lN9m971HHV_j)Xgd$c9X(i+e*F|~#W$(jNw2)bWro&0I+xJuA7j>=tlhhii4XQNVNMV|Gy2GjY zN>4%N)y$*$riUJF+b@m(1doBmtPw*qm7=Od_71Y>j*-#PxfY@MTr%E*ipckz-@e7r z6Q<-Ux!BOc<|PWWmy#oUcmFey}wY!*BqjyhzYBK zlM-shPy(fzDyHg|ON)}^dQyr&ZJap=&UtX=cYP2)l@YYP+TR-GFnv~N?9ENJZ!7V4 z1Jp7xSca4W#f4NkJ5X8X#VH;&V%nI#z+eEzNPxm_=C>(nWv4_TvPY}Ql8b=vwJ=CP 
z4bLITAB|DoT72!YRRwW)J-<9#1My&NnFIXX;?y=Efir#ClUwW{#{~v3WkS1%DIWN3 zjuBXtpNZ(f0@iB!s`w$QxVx*2COVa12c)Ykoy$fvt|O8xQ@?p|R)uzEr)3CiN-$d@ zT&@Y#3krL7_wzL7G(#W)1q{HVp{$ezzsH|P%mOlrE>O#qjGe;RN()&|LXm7O%$OP5*oGNYRCXE5FenPc z49XZx_}+8Q`@BErbox9#|G>8&dNk8(Ud#Qu@9Vnm>v~?#CqO|&@ojW`U7BWXu1$rU zymp;YB8wCGy&+HRI|pbKGlYPZ6NV61@JyR5Cf5;2|GZrUg2B~qC5TJK4s{LG6OO4L zK$VH;MLPQvb-UpzUbP{=iAShG+AuVmU!5YNB$&w=nyk6eTYf}G= zTQ@|@z`9)`>BlL{O{EC@Nmv_d7qKa!tpY_Xg0M%t*bEtt&5|bx-uP9UoIi`S$v=ZQ zpfT{VUAxcJ@NKCVAMwy5AWtJi7Q3Xl$=0JZ^ct4mK|jdT6T=JMP})k2%2nXJs@|cL z7!dBRxLB~_xD(Lrns&mu+A67w!WHFs@(GTyRf)Na2N^B6DD7gn=Fyp6GSpQhc;a$f z#;3zoJs-I@upt%F`3643v+^xQ4J3ngz@hUd%Ljn+W_W5P%riai4r5lGxHuG1TE1%X zc>~~Va$<9Mv{v?6awj?JLOq`i)%isuyw=H-n%+T3g80zg_G*-EVWh41Wd!Or<=fD4+=S8;_G48;tP9?Qle%vdJx)5T#)Zz5)HeKQxYz zgyaj_wNTq>zmJAkhf2Jcc{gZb;MCF{PHHH@Yg9-x$4S$8lA~M^g9-e>D<}ag8Q=6Q z`2y2gR82g@lE?-M7vubB$lfdOT!Jmq`lQCwc4k3-qQ7>Ty3g>8K_v0`@7935Fg~d2 z-o338K$D8l&l4V437b9>MXN7R;mJxmeIo59Ms{6%P&4lfl60gepAGJ@KmWiv;8}?IsE{8_DH?@1qLV9IbD0h zDR85Hv$fYQSRar2Y%@l*FCb&+J!g3hUTdG7Wmn3>%7KTw{BAicnZGLKq1tOpK2|j6tzKu7R=v4#24JCXUl`7y2|KEIrsj zAJ#EmgguP7g4^M&O%V&n(r?FHfqk^M-Cs5!|KB9bo~v*Ew*w+2)T%#TU`ax zv$B}9f&${5!{avW6(FHDjv0cg@d0aqcF`$JB=((WZeE&vsG~Yuv2R{C4#*4E3uM36Yfb zHBh;VLD8!^O@)BTET94Z3UOBg7QcI=TWqdI^LA}IGOe& zyaUpC1^ZeGn0pMs-QXC%KxFNn>H3)QDo}S0fVu^q1vMSsBEEu6uO_;jev?_Ur4 zS77??Jjg?|UKu#{y%qRt3h7(*Dkb6`NyINc+n{~5bq^=?*%$C^ zwq&n&KYdqu`7=9Wf#=s5ch&{;kNC!mkd>_CwcG(8$|FfX!?hgHZ0g?(UYVA(ONI9Y z2aZKw!7+(X43N?{y~4B!7LG|?Xr<6$+aSeLwy7b3O(-oCr9)B{yq=Sgj!(>R*EeM`2}L!92} zqdki^1|W9j?1#Do^})F{!fiytjqUHYb)$`{bSn(Lf>~X}pd#(kl~JwmNjnAJdi_+L z_E_*=7v@dNR2qK)D%7+q%+ z@hc7)t|hWw72~-+RMN~UU-(eHI5n6(Inb!k$bQ({b?SZ!Q2M4rPv5_L>R)$Kkfhkh z9{2wH!ujuFk`w3aTd%on9#ZraAPf|a9phACXo%vOUpz-L2XA+;`Hj>Vc5sby%lYT( z1(ncH#q3w6EPf%C=9&Icwq^mMlN0XtvPE1xB}cjsY7AI$7fO~U;;#+;g$2+-A9)@$ z?LYrf#zO^c{1=<{@%`J1fQ||i`J8tscAeS2lX8`L4)L>(aI@%Vl`#xZ8G`6cYzfuh za0H7P6zC%d-Cf+-8Sx6%2QOlGaMrdzzGF2lfVeq(>EpW|KqnGWzX7<0g?9&M@P^_k 
z);~5Ir(vSOvI#%A9|MV~fcs~qFDi~AHsA^HI`k=AJGK7=_djeX7eN5_~QKgr+L@`=FkB2yv$}huG(hRZ((FYQxSlL~Vz3UOxm<1oN4qMdD3chB6sbt z%r9SBg8Sw^fVO7$8?)p=v+2%;!?p9bw3j-6K^#zccV_5Xe%AHZVhA9At3J7XW+Q%V z>H_(l1JkDGgrnGSXsupmYL9?)`Q%vAi&D~zi^8YiO^q!7{Lgyd+qpl?o{bcrbTD=z z=woND7Pbm1VPXl8WnKP#8XAICU~1WQjgwL2VM=S3m-@o=Wo#dw$i-8JX@D@AIyQ!6 z4pj#OFbZvAeU(5TdXeM=NH>hAg(;*u;o{eOFIci35ff=oH&tj1l(1?-hD&B}4gt;W zR`R^Qn%5rV2Y+Z3`u8WNxM%7vzv{lVoyeS;iY@K&?tZm*yEICbGeUXg4cdbBZ}`hT zuvscn@Kl1=t-7xam^&8ER-)Q)RlR-NB#vOuYztcfJuBlvD*bJ%D|Q4HUCAWH5;b&X z(6g^X_ONLwPftE{VbZPAG58VHvATQa&Vk@XK@Ri0XsLjdbrR3X_~9pP1o6%HQ;>8C zp9{ezaJP`VE_@#<_q&BGIfkS9yn4S^4UTcF1CLuawv+Y$&(i{p;+A`L4oFyAPa1DOwA8eHb4$ zr*ECkdhsNJw*B;6{Sdm~3rT+?_Y-G?$zK1~tobHNgf+fotzA1i!1a7~LTj*PI_9@=d}XRi*hB+zvMr3mJB17)cUzMx5^7kQgCmu?^L!PJUbty-*Z@XKcq>`# z$@Cuk%F(e<<>D2s6Yr0M`COjv3|=o4D^m+t3bqrb8?OBL^=3sWkO1j{D$vx`7_4~o zX&iLr@139LfnK>9i;n}mc?drMRXU{8JJN`XrA7F^lSG%$+gIH~hjsBe4)OMJfHxr% zV4MV+63WtD!rIz3nI|QB^KYx3UYX2Vts>(2QLtgM(cOsN#fbUbP}gi<>3T11*auB= z8i0&z~Bz6JKXH_@DA6ljdCax{%E;L5;MkQITK}OIlimhDNgXzOw zb^z_c-J6BgTG}gaC7u4Q%UBZnI5r+jr;{cS!Prm->U$km1XRI48KX}mbZ(!i%X)OR z&?OQX0yH$yOH(g;14q!zZztM|BZDVh-x9qB7Z01xbOcQw(blFfcC=UQVOsN5tQ87W zEo%5Uqh#=F!z-9(XqA~%makhi%gqACBK&1_Kr8WW*A~(?LDEas{i8BJ^nWHG;lx75 zi#rfCgZu|mZLpt%AC`H1%L`(OYsGI z6i3K7*llZrRdC>|0%?No2unF9!joq>CokbdiOW6|E*{x;Fcp^eaPo?T=9T2tulLUt ztz@WHcPPv$_scr!`p`4Dk9}hEg2e7%8^b88jxGU>tdGv@Ip^#jLQDRP7Gy5+YXD%s z^YGMc-Q}>YfY&5TD_v)X)vKg_(l$!+)bi783#;f#AN^9lvG#!ZB(9pQ*5QnyiYvdj zt&9nCgL{u~-MW6xJ->Pwi}6bKC2RX6f4JQIMf%BCQ=V;;^mJI`x_J4Hpb--ftWAPu zc&+f=1(v%5zRg#ED2ti6wJpHm?$Y9Za)yJkLVt@xZlIfp{mKSM?iZwaIV(}`w2F!xGqv#y1)BXr6w52;X!x}yKYT>i)}Kl(|&N}oiB2FT{3*GJ;%wd#>IBf%W?xD zi1@_i_z9=kFMC0zby4DSp(wDIsUMH~@A}~q8i!x8ddGHe>R*}2uH>QL1ko!O*!>q; zoQdyxPk*_CT$?-)Pa5SP97}U3$`HPJ?#@Ib>#m5B$!g6;+2=iL3uOu==>8%FvW)B`)W zHe^HN|K;uFi-Q{Y9aMdDJd~U~d8m8UOf_O_bpVokYa46g7U0n$Za-bza}l5Lwmy3I z``lkp1v3umS2|Yg+=5(3yhj*@N=9%YjL_on@G-w<5(SI-tMZe*WU*pw_aH~GuZGM# 
z>)OdutXVA~w&@Gl(eCWrO|MaP3n_GgeJI-qAhOlF?``_wTuT?=Ru!^z6@QRRHLkgQR+36XQFKoCRYc^OXjk~Up;WJd9XfzHKHI+cA}G4w z)w_PHR*|m}mB7Klt+D`hM!sehNBC%O^U94yLLSB@;6bs`^)X!+}Z(| za2}45Lpyr>hxWA5o=5VbuEQpqO;Wuk4yJ}$;H<)*5e<-NLXLfOC7F|Ayb`of6?tBN zIk8s}pL*6!$y7ATYd&=t!$K5G?kSkQj>I0y$*!`0_$rw3K-KFpHQ#)S^%X1$s0YO^~1bFsc;=r-4Sn9yk)bb5)SSMi5 zm~gC;eD#r*DEDR&O%A!)PqXQ`T6&(6m?%LldNddU@)cLZ?N7>k0IDwieF@OdrT{hB zjo&l&UBBgjT-$k=jdm6wHu7lC_;lvNfj3Y3;rT+!wDjHLcV;hqbBBc?1$6NB4kDL4 z=eg9c*Hfi-{%KXZ%!e=z== zWBqin7|<0+X!t+H*_AbI-oEb&(D_pOdf@Wq%Yy*eaw$dDyn+Ps9@_yFr$OMILZGdS3HHR`QR>@==z&4K?uy}2NJp2ubfx#A?N!V-T z^JcFbu=$c|=`qp&y{msLrl?J#!NQ;>PX>xC529infM5Kqaog0ToK;RNSAFB-@ z%fX%c0;$VWyy7KKgodZ1QX@Fg_$J%@VK*j7nzsQM0LBZHYULO`}|b=D3D6-V|g7QQThM z3ZKZ$ai9nQtfk_YZ$QH`%=Y@@`}coyA_WJnJ{$Y^F>XltJ|a2wa6vdgNznU1=OMZE zwk~=lkcSO(5=!M^4e59CFNZuTyS=ahkWPKNc=;c1bOV3xt~af%<-CUTi1pPFasPv| zvbR0^W@NW+vLP<>Kve+Ib{Xh3mhn`A7<~lk%?I4KY4+5+Kz29aT>snq4Dc{-_4%=!7;dsJvXn<<~>66ip{Vgt|pzc^A zfvRdCzOhKAkwI`5BGR58WLnbLu|;iP>U6wE-PUGeP5Pe)j_uHZ{ep zdnej`_nRMON+_tpWo@s*7NlNuBEm+{N6? 
zUz5NlhzF*^i&wey{QRpM=lVpJ?el0qzs+vkO7L z{iFt(jY&Y%<_l_j?b641{I>g;0k-3Bzx_6x3nZ+6b8k)z68jqZZKkXv7!M@P`$lfb zs-Q!Eou0am%wLrR@`fyJd0)zFjY>L>+YIOofLhK~&@1ghre>%i8O|GTONCzPysf|@ zgJ$O6X31PU_95W3%3A%JylYdtH99k*knj1whEk=W3;1jokB+(Dq10LfGCW< z)WC3p(2A1~b~_ArsF2*tePJH~GCcXLTOR=Z!N-W^!1qTVn#|L)*x&^_5vx$ADggTo z4#V-LKjsYxa($K4QF$FOjL0V=ifh7`{l%X%hE6W$QwpD%&rjY;H@H;S*J7k&&*R5Z zcbgc0oAR|0MHKDYvmh@`Kj_t&6HT&wo+&`P-M$>JLN4YNm9xeCr)OJuP|U=9_BYc_ zl9z|7{6Jf3$GD{R{;vRrI+|GM>&O@Ghte$QsHEk{-2}>Xxn3<#y$UnM;ibe$pJ6ca zA9~+!+me-HunKNmoSPvY;xaWg#X=(pW%bX1PNAB2|It1x*|{L`)@9M~Hc?K{u9cCK zCmLC$79QE(85JYAoOfsBirB3ymKS&kpuTQ{^?BRLHmmEfwb}Kx4yY0gaCR^cJNS0A zV2DuLN5W)3qTO`_3!)eB)Fcxx+E@EO6H*QeCv-gLN z+W6vECplkv@{f@~UlV`aakdTq3<$Ux+h!H*-n+iiM;YP(9A$92-QeV9qK74Q5s-2) zrEGHubM(rqUE>+YyI!2`>2QR4FbgO^H6=8GmJlI|R0F~rQ#YK#eRky-kN+XXAEO=b z0yvbW`H{rHu8}kV3w{|(4?X{(n%@c5x;E4!;0Uo-2Jp?+PvT|;=Wy5M+D z=2jkyOF%zB>I`uoVVcCo6{~E0f}Bn;ZF5(58t?lcLm3xi55x%=#!!g6c@OmD&KQ^q#OX5a)EUu!98F-m^%oMirp}#rz%J)9k7nV z?N|XMQ9%=7RLHhEfX93@)PecAw`K+7{An-6pdSqypolK>&=`#%Orn~-S}&5RcBn+L z!o@Cejl^EV%cx|d3ET!PO-)*Y<^Br$l!A%MHL_e_pT--LK7%u%j?)b7DPUUtna-jt zfv;CZVrTc}qqaW|V^u3(u}v@*>t0<6ICkE9*3hGYQeSnrB_-EE>vfR@DeBp0MujJA2iIOB#GqOga)Yp#IM|s$Lm(@~3_t zOdW77;P0F5zlCY*Asb4Yk5KdA_3LXK3ceY4c(MDY7wwF+zVM6U@`8wR%q0Yr7xPdf z;B(au1L+3c$($RGKK4*GXa#*?Ly%!|xwDiIzT`$Vx~s)x5I{gyJ*m<;VkMS+TI|V` zI}4_`%>)0o=cHKz2{?b0BV!r!qz2%-9%GYjAhLpb(7l*=eo^8P%lbTwaV#DF7?@Q@ z-{3d zi;)oarR{19i~FDJT+bh|jiuU)I!>d#s2E83DS)^|IW$`dB_9I`GT_LEN;YrZGuqx~ zyN+4C(^TMjDZzMuNsJ$HTQ#6ZqQB7krZIUTbi{HN3AuVnW4`rPU=S)x&IFD~ED_$q zc0K`6SR4oxka`s-5$ATT(z_BiT)+Cw^H`qy$`eMYbPp#~x|e&uy{^CrX%)D2K%p40 zi0r1zSv(|Ad}eut1Eau?EtL2` ztSQ~`Xz2sCfOw3F{b&}Q=ptG#XJaAe=9&b>9$KlYGJrvQ85*%bglEBWxpCzqW6w$l znSUST<~g$RW({DdCJYWi6EJJEvyHAj>=ZnKIs)SsT9_yc% zFS6=zWDar~pt<}gd**hV+`>4WpQf+D+fEKV@?HnDH{)7(1raE<8bIO2dkV#FgrXD+ zy4}+q{qKJ5?iW)lg!c$c?FBUF?;XQnOyhV3y&y8LkwNH=D7k>5B*dQz>AB^{*b}Qy zO&66pij`8X+H?o>hTX%NIv_S|5Oz3gsBbeY%MQbhe2oF*GKN%c+Tk^7O{fYr;Uqh6 
zF?t4{$b5;4ZL<%wwoL7ZpJ7Yk1aAQR<@Dj9p4m$KOu1J!cZxE93pEFk(Mx(oXp-$5 zQ?UVhT|}mZQfxP1UF~Ou;q@CwW*2G=!iZ{J+&gPib;chsw8AALzKX?M&#SN|O^OrI zuzTbt2-2yrNixl$`O$HYxG>$GuzXBuP?=Q&u{|tg^*vffMF6_1Xw$bsZghgC?H59R znazUHiNnC_{|3krp9{UFb~w%wHIqIL&kWJYqTi-{p7LL#Ur7GlM=L)wx#n!4LNGlL z4g4=XXxr!To(!YD{8I!G;(7TW&kj8ce)v|=yvZ@|TB4wDMln9lDu;(KJt7cOi%UMe!rO>1uKM&WX7_E!&M7v74|lizQXkHunk^p=QvVnDT`Oov%YLMXsVVtW-w zduQ>eTKOgDXwkglTyNNmVakHebMqUMRhTcmT%ti3MJ}p$mV`=MM(l;BLmyaJI#C!Y zy>wP8L4S4v-b1V#9tkQlo$V;?3L6AFs>#J%_sLafF#y%tU!m~&Q{kLlq;OBXgIg_z z9CW(QW!Y$!qe*VOs*Rrr3NV=E095)o$|VFS#QUD9`Dt1_>vkhL;TQM&&$hBT=O*$g zwlx|#nPMx0PuW3ac!JoeX$@Q%DKiRA3!>i`NKeF4F0W)_)ZA(ci_p_P2~A2Ob*vt& z<9>88`0cS8KS$t>P>q`-_!WC7JvZsOyFDVmn<~$CTO7@gbA^elEK$}@;Np9y410fV+rV@kjt!?rWIyJ$c9E`Yp9~#Gcie zQD!aK@D(Q01Po!audM5H&%P7Yd~^Q8@2#Turt)Xt#bR;O2iA|Tvs_|eH`;!?}ddai8|Ry?C=R2yCROrY7s}a&Rl*AVhbTk zy?Cz!Fdqk?ZFrs0(4@|U<7@$2*P@ci!;fOZ>cpou=b;*>&^4oZoUbD0r@QJZ8Jn|E z1qyEvs+GfDt9yc+H3dxj3*1H3ZtFLoz0k4EnksC|be|+C?ek)-@ZWo&8*O+dTlpj)hT%&a;0EV1%_sv=XuXiXn}dMlFi{3j zEd1B-b6ZV|3*M=kHWlO`a%1>6KoSbRh|wHZ!w(e(+haxGw?w~SJk0?fq>b?qa}x17 zzj3?L%2H(voV|uf&NJ~a^+woGpJneTDvl`EjEq%>Tf?Uzi7hiPSsxS4MSh=p9)*!t zWz`ouaBYZ0h1EbZ{GH7jM97qbD9ESx*HKgH*3DOUVutIhsG z__;_)8-4sN)PN_GR!&iRE&FZK_88h#=2lVM1_9}+%p0NR z-_{pdGOBS(TVMKqkTQ5IsH`VCWFXUi@W958^(E~ft0Xj8ErZhEr?Cx=mpQRBP#B(G zSp_?$K7WS!3jd|>_va~$fG3Q=Ndj=p!#$M9E9Wif-H&?+(O zk=XJWo0&nsc4WqRMM|)d)l%yjF3iWj2~hg!MQz~yqX^208#d_S?^DBK;P_9AFgzs4 z`9Q%mV>&S2(=V^*dbbpL<#t(Ku>KT+(^*+ls1Pa_dMQK^vT%@cmUDKsigGZTZ*Onk zb5Hb^sEoLR=-lFLV_pm!Q>|rAy=U!lEwvB&d3TQZE|+pX+9t?1!cv!3D2{|8pd#^W z;s>)s9&aZ_UJ8%xesq5I?F+TT35vHH7axbR&cmYtC!+fB!GfC8>5a2#3zttp8Ij+X z{I;56wi0OJK0dRJ$u(&LvHdW=Wki=fT3uk?RG zLE|3Ow7!dGCyuVPAD$uWcy$>5`TYMGh;_K=Vy`;s&_{Mw-Q?tr>uy&hf*{<5Y-XCR z?I&B;K|c^kmKfABn(Q-0dl8V+!{J@a?~TPiK=I;vxW)tUTl`3h{>;qzitZ5QN52F0Eown`-BcAm+#Zh44z63bU| zQ3(g5GQa1Zt^7#owx1%g3#BcH5d4x+& zuY8?&U&T|2sz%m`deG*-Plz2db6I*~Eii(T27zu=XUx?bw)B~ZnP{BW3Sr%YnqlQI z0=q`Z8~-JYQEeZ@0lEIBG)eTs*;5)CKVj2^5~pLilU@18s_=1DpO|d4(kYc8*5#G; 
zm$!wRCz$gql!|-dnlAf&GHola+m5YJoH1<##LwdFeCE7;sLhmP2ffT#KVY?1Xvhni zQyjx)_z>+AaIWwXEzA=_s=gHDWxjuL;T!#w+%*+fm8~jKI;2Mz`to)kMJ6KjA{GlG zQMwLELu#+FfGepm9f>qO+!r$!y0z~_1;NK8laGQ2lm9}TUgY~w^Ct)mdhPW9d24RR zX`5h`#qNwfe_tDEhb)M}sdQq~L$!krja#QH?Y+{}#*5mF#X^uh6(mv^O69BRqks>Q zNOM4EtBR^e1YL6rW}c<(uvurCR$DdcN7augcbdLM?5zWolC@{U*Vk5G1L`L6u&bu` z{#bqgnSCw09;$^OaaEd8z7X`EuXJ}8C{bI|RhsKb!*u3EagW6PMigb+-&XDpnWI9{ z3hzD7eF4(jCC_dbhwV;plM&?2`i+}(!k3P2rh{I+z3Q8WX*61tB`$JHBa5A>dEoDl zxWzHLzK`J29LJ(vDsSk$pYG9b7Bc0AdD$ccgXeu88A=B9ASLaUOq@SzB zeE$2&iH5<;w~`We3kwV1zH#G*TV!M;Xu)aXzwBx9=dIQgwrn9q8nt!&=ehp#t@O@Q zIyzE-GgW5@6iQBMYVK7J2>k1iT|0uqB>(#GhpDRf-b4;xK7KlE(R&HxIg0IZw@4wg z?U!aYoBX+q9dS{`Tu*dX*58M9Q?J|{bFlIQ=p1?pQozj#6&010eb<6t{QRf!`+P)Z z=E+y;iQs?E|NmIV9eZFm>CRjJG2(yym(0;HJ~z&cN-l4>|6xe}JUps+(CY_Z^KR~v z3b^R%p_Bi34gYf%@aWtA=hNR$UOxlwr;Kv*@Q*GC&fJJ1{4eKxSQk|swwT`e<44gO am3HzJ-h5t3pnn7SbJobr5O3fb^Zx+AX}-z; literal 101303 zcmd43WmuH`w>AujFiJ>E=g=iccf-&rEeZojmwTS z=gAVSYeBL0v+w5*a!ysiXd=wynYWX& z-d^DQ_291d_rroQI1~EruCELVv%Y{nh4{m*-gXwEv;Ge+s6#?79j)v#op@FC*vnxt3P zcCneZq@<9AhiVvZtCML^^r39$o6quAxgJdu>l=mp>+?f}?sP>0ORZrHc#M-je^ao9dgwL`_gGBUvkM`=4QVD^1O-bCS#+ zk6_GZqaTFB!MRE=BOInjy*a)RJ28A!FMk&jQrMBHV|QOyyk`p%C18Si$pzF@*+qL@ zu+dLo3(5p#qJwPMi`?k^l`OCOgv>juPsT<(R&C{4NhP*?`Lflc;`oZvP64vtcxdwKB`Nx zv&k!y5kf%F;WYZxT8jb6WnVJf&pej46s|i8J0tvL)7sIQL=atLXuZw=qMw}KuI``M zS7!&!G%A>P7@N`)&SILpMU3_-tP*Vfg^)51VBnmq%YHLKt*L>YrdpDsVCAtGMexk4 zlxfqw(bUWq3%!l6x06ldJ$?C1)C9M93Y4SR4Zpr%kB%dP{3?zsP|5g!hx@@|Oo6nq z-?j{H6(?vvXZ=r9Q2zl*q^ulB`a1>U=!W4kGT-;$(;pPXe>63sc#`A#ixk!Qt+&SW z_S^AB^`tm0+#g8`Vhj`&Q4t2|#h_qL6sCz8Vx{Kafk$>wdUhofgu)gkuAp<= z^auJ0T8Cjo?>~nrp^ob>)7|@oHu~N2GsbMA9l4F@-rhts!iMUEi%cbto754^O$KSb z)a&(h2N_c{`ukw~sGHEzU#%pVqHl1ENdwN_clT6V#Yf1ZaS<~2y{2{jd!*Sg8G)au z9GMU>;TFn{a#g0Qv^ks#j{@vi|0PG_?cJHy_OFt7@|iqY32UWvK`lIvCK>+UD1k*p znvdye`#kr*bF?|=_fo)f_FNlXtA#onJsN4Y1OzUfayp*p`S>TM_q;-LDGKt`X5-+2 z+m46VG6q2$#!6=s^iT*atY^oZW!3(z-aL!1}M!S9B$=__1|?uA`w)u=vkJds3f@9GS)AnnlWSdNr#dwXSgdA<|5}UPcj5o77XmtX>o8tU;R~g 
z=XJ5u1YiHz$3l?yHhUJay)9q&{2{N#TD%Qde=0hQ*(AVlA zmm4ai;-1s(H-iXSiT2mX7x|l0Aad*YgN&4bcQ70me!JkJSoGBA3=x?ixTu!*^`TzC58#FAGAA|$o) zqljrBX5U$=4+g0JaMhs%tS%T`S?=krEZK8e!6!v`m}0M0_|*7H14}d)Gc(!0EIWlk zw>DDwM4fK-ouAHrx~sDo0q_5mr>}Iej4s>ZO!D+i1Q5d;dKu4uVTF1v?pLV6E5=cx zPMwZ>mJ(t2!QL;~2ja!d%y2&I1dTi#R)`92u?lFt%7|Gw3_Edk&vs(}&6qf+Yplq) zB?vPWxC-aB$^!J)dDFXO{GdEJ+&BcY7TD?N9sL9fueyP%4Bc-9l|+gl^r0mzGG+0! z1=nPr=6lcBYt+=Fo9kUTw?&lv_Z#OQU-fIDE8~UEreWA!t?zbnB%Oa+eYw)qzH+%3 z!NB8aRbOMgjx9=3fBG15^m+e;+@v#VionYXLq^E-TQY6%^NWK-v+FZlUq4>6&;lNt zO?MxXY##<$K|#f$tLOqE90+LL=eC+=yr5ApkBkSlnn7-Lw^OFM^+Vd{Ak=9;jhD3; zYVc!{31$_~sI&^B0e;Q<>7NMRr^w%v*lQ}(sF+9lnY_e!DEO2~J&EIoq38EIW#g8T zIG87UCt;NE(f#=$ddm!&7sFZXS_iGKU!~jH%`09btmIOQ1{4BceKLRw0_AD~L+T%p zSiAvxVXA^pupGQIgv+6WcMaTA95GtgUat`D=`K}CQ)v|UfnJoe6e z(n2JbEVBnjJA(>4LtZN0MlcR?97Jp64=-owU5DeY{=IXDP;&Rgcee>K&@sx7c3i%l zO-yB`Qciit?XmQ*JoDxCLA#-6hVy4#^Crv9y77{YJ*K7Q7dofKt!JW(&mvFg!*}w!;op5s0&ymqy zmLE;?3dC=}uD6~ik$?SI(!*o7<3csqkJ!q>_?&L3KE(tWMC8EJbVMfS!=-(M)wqoF zve#ITj;)w)>*V3mV*V&vymHIk0OQ6YA?tcBnLR9>vr*#h^h0*yL&?C^CgM6pSO0g{8nR<}e@1~Dd;5YX-9ZBZG@sL?KeRWp_M?%XoZdUOCy$5jdL9k!AaF1btD7x6c$B zaJgLO)X>nV`5p8E+Vgs>Un>Y*X-rG)+>dy!K5=BRCH=>Xoy~3W3YAF5U$M_y9M+pfp zeN9c;=~xQZ?1!VE*1tcbxj*b^L&9T|$BI-i6hxDhF;L@hU}%q(GaOf8ciH5m<6ejw zYG55p37>3f=xMNd340r?7!B|6Z7UGHD$osT790P?ADVpo z)}IbMF6|F*$1O*-xkntpbx3MB!1LfIoDu)~JZN-CsYLUoiC`e?c?337{4YLUwqV$5_H;08Ir8;i^n?fffgoyW-v9+os>`H_)|uyOY5`Kb@s+F3_4)`Ip&7Xr{A1N#)&vlI*{C!mM_u zmBzKyrh0OJiZP?;(87K-RIOLNWw&d;R?J&nD-&t2c@mx~_ruArA7QPLUr-2{>yHDH zE$_9bvx*G0%P+Qp6ka6*>sp#~|F3A$Gfdp-C(g6svy z`zv^2GNX;}jmqO6=+UqND&TO_p_t%LHF|RUEZOaC#iFSm46U1WqJXYtKYrxi!mmHP zO&WdM2rMm?7M#8(vP|UV7p_d{D&4&vJ0tCsfLTrD$~jGz(3S8nmqXL0=_(l~c~BKk zPR*x-UMMqo&s(w{j=#eNEn}9U2CnbQfKNGZ zy>_*^D=uQ%E{y(ph*zM5IICnMJXnfVkNh_Od#S=dVgNLDHoiRYEt?qM8sk3);|8&F zsLE^O79WH5u2wN-1)<>y0^|obq=fVlJq&x(V>WCkl0`a^q~<1Q9SK00tar7kx~X7f zLg9THkhj>faTv+Thw3FuwA@rnlf}xNP?-uoYuv324>muwYp@DE3@2~uw?P1%8NbDx z=|BEKodYU}C1)0lB!Z0FH9&&aVouBU9DJ{2?TPtS9ZzlbI@%H9M-qa^;dJXM^>!P@ 
z(%5nH49c)Nzk6o!(R9WL`_(U>3JX?Y4E@ zN32``01>0IrMmqXPQ{G~FV&!RoFCg<#LvxRjsg z9Ks6Eea=d(sX45@&cF;zmjg8b$7a3ey>qkrQKrjOHq~rR1Hk90F3D+*e_5PS zkVFxo_jw}t1u>4(H`qw+yKXd~@3vn@bgqOWJp$A3NY73eDl zdul3B+uYwh$9&H$2+&Li4aB!Xe5KV8l92}k}n@tZ(33D2H-mlD#+dT5&rLn>87On(yweKaEp96_J$Lhy^%V-W7J-p0)&$Y< z1SH)>6_|dgpD5Qk48T`Fu|E!@WO3S`g8(6^Cql*+kkn!tazqkow`ibw!K~d)Wo-9e zMS}VR)WGRY{T@xsz?Mn4+0Yy2r8YzGx5%xOzG!-d7Ln8>E!LZJDs9!Z0YpHGMN<~3=Clkwth|BzIl#C>JF2A=wgq1l2@1ky`P zvx}miG=;-uOic^IU41_y1l$gIIw}Om$A{&=$57bcND30Bp1O}_mrSHe)PfDhi#13& zx!5X`=BC1PU{+p49gMSDMM^(MGl2tO(lj!<#(x*7Y70IFTCwx0HiI2EA&zf_HckMO zg!2KlrnVN$>uRGA&NFwvujbc3bm+}N5i6w3vHL8Sh^A#P0PXFN7HX*kwck%#NVbrU zW?xJ+Q@Qh-3iz32whphNOmrF_-vTa^%i+!s$yID&HgCc*C#Tg9OAC$eQU%NJj$FaWi_S)qf5&L3)uN4IPdR6c zkvMp(2=g57cqMeV@ZhKXQ!FNSJs{uQ%2TtlPQa{kx%&|wRo>v^ zW7mzwd~97QcI|IAqPSuHW{W})zja*5-t6n!L?+cgML=S?62GM$sI}Y;7Fw+o0(_G0 zw`v6ZM`5Ig*r(llE90-Wewhz$m$Rkn8Gfg;ltYKN;`gw?k#zcpn?sdq#e$H-sgKjf z?gQx#?aawW&JoI4B7z^VuyjNvcm0XVfyl!RlK=?z+e>lSVt1(0K$a*CG75La!L#p? zGO`fue7W(XA~JA&8;hD0O8c8U&|iNE*ElO4ooBpMXv%6OI-qrt^8{%7`J0~N32w1# zh39;Bn$wjAu8G!`(h>d|dFA?aO^2;!le95*^F7UWO*Zo7T74aZF&NrZ>P6tQ9dpxy zV|{iJ+&@3_!^#aCR5==dqj+3ze+l~4uRev3!2Zzr=qgSi)F|a3iTR(qc+A%}O8pW{ zbQ8t6QWfu|tsHk%G`EhrrGKGSC;(?HBA+8Pz(YSNLd7UrlSB)Uy;l57A561?TUx%* z2^(r2woOC#O>tFzIPn765X6o`*yJSm(7^&PD?MC14?pn{*ZCw8V8 zUxATa={S6_U92TPE*JHx5s#DJV>BbYOAp{70Uk{v2lvxf$4;k4z8oZ5%m?}-NAiZy zZzbgA;0wS-?trrSzB}}3^Y!zgd4yH`$Njk}Z@<0j?}K@=k%5}-#XHXw*XkZIYq%aN zid@k?34s%eCfJNn*7zd-n~ko-r=Q_fZ~Il z-{GQ-QNJlZ{z!+}W#U#w+fa|fT$2{L*M-7nxxQRtpjWcR(PWuEU^;7I6o2z3Gu2zCFEf2FzFbf9j1Y?F@F1`M#IoSF@Ot}K6;PXozgYIQk=>yWn_w; zmLf!!r8dO{5vSI#Ex!ymTPuR0XyB$2g$!BNTEGR4+#;-h(iYf_|JTErqIQKS=ZBC= zt5j(}J|6WHbtSoBy*3Fe^NgNBG6#sqNO(=y9osF@p#8rp*!$RTFHO8@rcM0X2nY!W z3Lx?8<4T;1RU>7(RY^k}Ey~yezmfFYy_;m@<-^KNINQ$Pm=r>jlk&>7B(E({3l9Kq zPUy4Aee~w)M{iaVzT#n?YZESz(3UAml>>rJc?fFt%F!R7T1dy;tjvKce^UlH(oQ6( zPn2on9#JB1=_y>)mZ4PS2`G-%34Ryu(y8q_*W(KG2W#NN;!?!$2%U@rrB$H$gBkw@>a 
zee@1H@#;tdON$VKXn@Z32`0tAKyzQ<_^Cn~hOje81rB5{R-!Qu@ZoKp(tj90CX~2v zB(Cu+uLD8Bm0RS4$CMR8FBz zHu3dP-+2LKB&$}U$GhQ1N{lc};1tWsD=8~}=4o{Cm*KZ1u6|MofLd!5lof>{=G-^< zm2CS#|6mLC79479169?S;S52oQSk@whRlcUFWJgbh+h?kL!Sv*1F>S*U#f=H8x;iI3OY8E(%vR2exB&=b~ACwsQ; zi_EHZ^0Bk4G=JQt1=49|hU+i=M50sR;-Lg)E$CFKjwX<5D1_a3tiP#d3V9f|QC2-Z zioZlOJtix}HE_$F>jkGhTD42|<7 z?!Ktd+ZUSuAoIsiY{n5d`yu^Qv&7?cvo9pRgN)|`wb$hVA*ghJz9D@t6{ZJRw1Dkx z>kOC~gdlwRa%gY1b_1w(yc}N;VFFggKl-A{7;7y?xJg7`vgv&_nd)(AIsavd6|rdz zrJ%nl<~7;ryo=X!)-~~|6Z!i)$iAf=I)G<2RhmfCUcTt}aL14uROWiPWXzzJ#rEvk zvl}1z2f0{EyS|=T>X$T|M)=aGp5(N-B3%**A>n^_tHxzyZjPJ__S37r_bAv7B^544 zMDyymP@nF?|Iq>zs#3&%>c3U$7MJN#pHAcZh0To>VxSz=L+%aZXW=sQ)z*gi!&R(uhCuZKm-zkPd?SyIL;xT%LLd-sXg`kCY)#>C zwz!Gd!>voc>CH+QRn$#s)p^d0G`ky6lG5E8=YV^!&zQv~T8yM6GGDgeo+xcEwz$vc zf^ENvy6w-^0XP$|s>|juv+m@>-SR_XoGUp-6J)mdGa_e$f!g4AxBr)9e7%1h=@eIy~m`MeOCSpM_>X?dP) zwoaz*?B(g^NPHZfJZTpO%J!U16)($cg|8C;m(ZAEVPP@Y@o?vm|F-sxo*vUyhD%Ju z#;;sy^a*qjJLdQx-1iKFFopA6;+tfGxus=#4?0-Fk=Mwis(FVcA~rU8+4p9|JKAuQ zwW?LqjVX90Gq&s2GFXiQkP1xv+cEuld%d9xAVL@RDa)Prc1cY?u|9H8o0Iq_P=OetzL77ck#$ zZy>EpO9N6zFgCzUNrlz@DgYvt7autRmu*koc{o#8S6!ueiX3&x>rix!CDe;Nm9)E0 zJ#9}bt0&vLCVZmkW9mE+htYg}Vw2O)3))HM;x~VdHEAEgZh9}rIz&T5|E1YpV2j7; zgO5o^dk+Jn(q3*@(yx;A6-pC494Q>>M&mq0^XaJYQ19#0#vDa#aMH}O-@|@VdCl>- z0EHB0VDyX?anIaLwb_WJw%JyudltF(QD=?o?i+lUDDGH+Rgm<%_wAF?^;!ySd;1N` zNoz5}_8nPz?xSzpJfA@@U(KBiadxDPeaZ4HrD3ZOz5lx=XD8n(bfB}>N9K^ACU({1 zEoUX~m&Rjdx?q#-du{btTff6zhB8Oh7Nq||L1_w9uGLE_SfT- zi1#IO0y%W6+c8AwPfn1vy`|0jv4iGa4v*6jew8Cv)7LsJ=H`&T;Afg&ecSYT45iOI z$>%b6ldeWtG%s*1l9*f$)Kc@H%i?xdm5K>9Ld=mKEfWi(D}flO&T5Oz7af|mp+oO* z1%=%Y5=JwHlX2xok7pcL)=A|mo>n8gF74MWp8f23pe_OJsj>09mkRas8!--O@7Rbk zt=s;7C{#)w1Oomob^#U{A8{}Fey4eoSM12=i~afZb>x{G$)Kp465g9zS&w?(JI}SQ zh%W51VotC&^}O{&kxJQ}8@9}o695XFt#s30RknZ=m!(6A$5|Glwji6Uy>f&vCz|UB z2#FNO|NLC}rEI4bOs6h9z++n6+ozbRnpw~pJuL5hm=1(a_J^PzoccZ+>OOveY%Gn?t55D32ljCgZ)j@EBiUbK#xKYDqRy&1g-+ZH_=ei{dtEDUKz4= zyyF4trT(-CI0haTVVX*@M7~c2x+&&UeE=s%Sjrie#}x#iK6}K{SJQS4iwwn+Y)+Q3 
z!dL6@(cB|qWPJ7uOqN+5MOUPks&LkvH*}p{E@PsWrZWM8GKPwZp9P8FF+DiEy?j)D ztu#NVOIPRWFfe2dWM@vi6}(O$`58mTEpJv;MKnm!SKbygo0Z)iQ3e<&6_FI1nt~4s z@ab2sZDjNETIJQo9*4nIkIXG=#B3xne~6DLh}5(fMK?MaiD(&|%kKTOj7s zrRjqr=GMvPx~q8NXQpjc-!Wdc*?w~n51{Pzd6t36&E48cgOkYo1KP9nJz$0Kj5-91WjsI;V zN3%lv!~M;jr_Uhsmz0rbW&N}u@y@@`7GgE`KeJZ88q?6u6)`O1Hrb(3C0BgC6DZ-r zYqU=DQ=?e27}u;4XrmY`NI$_)11_3LcLe*T`z0c2(m4*>Z|wDo4a>U5vk$RSFRr$L z*_vEpr~xk?<|^mPf(yO-h9YEdT5Yk5?)?wif@oZoA`)JGBsi!C%$vBrF+>aPN2Gq1 zSbyZd)?+g3jc{LqpyoQns0(N`TVHGZ*XQd!o@S`nj6oPie0G1m%{<%%W&*`b@c4L@ zs4OBVZDd58-F*E3&}i6kkLtH<7o|s)<~9#|XhhuLVhBAR+YUvzCedxcD^uQF3(5Bv zKpXOmD_(>vJVearIz?JdXdCQtKBX%v$jGlCSbcFC}J33`Jjra9TH2!z<5^u?vp(YJHLn%H$zn8n)nSpKnl02nW~qC3v6 z3*V2)Va&=o&7+P-2-jWzvTs|i1m}ul z2G(D-TLz*ZJu-7O=8^j3wM-25B$9(|Tnd=l>}(4pR0`F7MpZh*J@Y?urAIV+RJ++L zn%R8%ail-vlQC;XZE0QKUaj_z66t)=5w(Uj;8-(Ns28scQnK6teNGfDCLD`4+LBF% z5c;dqxCLN26GIoepYpzbtwqH7*4F6Yj*Cy_F92`NLwX$#rHXRPyJd-3IXL(tp0Ru` zQHbJMop_JHg~wPzOilAVvb+I!CKn(ds=gq<$b5CXt;;k3{_Tw32_P!7hd-x5@y*zz?4Z zDOdU%MQXjTS!rwYUku26lQ0y^Dn(sQRs4!c@Njr^V@I2t9V(qeD2sy7lX~c!0;PORUY-+ zw?G^^bsv@IwiM4Xoid?Z81FB3t2zcv{O-(&1}j5bUd}$Km*J`n7)qA}VURebg4z$r zZ(H+f94Wj2T4;E>VZk|4V?sL_7rLc)8)9HB2B$%m z*C3YNz-^E1c8X^!A=aZ4pX}?2R%>w8*fGxC=;cg9Bk&ooe+f{HvCP#eac>RBwT_9! 
zuP(OG7wNw)nXW72IaoHyTePLtd4q-OWj&S!=XNXiK)*3-;eJie%CBpmE}-!^MP)=d zaHda;uYVDLNLX&L=atPjNEdO;DcBpr*;DfwR#U$CIa@n5GC@UszTo_*Q-G!|;eD3q z$8nS|GT%CgtMEI7-v)NU>gf3R$pyqB&O*m#+rjRRl=XhtT|5`3$d*aIwzM9DU(u`N{12qn`lO>C; zc31T&cJN)#%PU#?8azFc^xX&RY^F)4KP)f{fBZxJa6R2IeAlm5oTbEP$mkj`(n&_Q zjWVDtc6%g!b$7Wu!hm`F+tFNfA2BiP;5X&Ega|Xcw4Vb5I$RcDn5hX&9ScTj!W32< zIwIDpI>41?@>ak=R2T9W1yVHIo-xx<^>KF+LV38{QSRk;f7hVwxk2c$1xQN@^q1nPa7C(&d8JMynAi6H{-7i(+4$`hUewytE?ig?gcw@p*G1bCQZI~vM@Wx@#}9U zn(ge9eI|v*o9Svv>qQ|$jiKotnm&cFnuuEE5EWfygGRW^*fu(H5iNiXbE26B9iO(G zjVl%UJ=`b4)0xJI)e_7f`?_aR*ze`CH8Ncf7G~KzHDXtit|qnALJ&h|L#beY2{Sgw zr-ro_(N6ef96`#F(DMkEiPGJ{h&@ja`@H+!jl}ZIID6s4cEgh-_V~VrWqZEND8iRY z1|7bS8gN90xXEgaVBea(0XI7=-6rg`|+xwp}NvB0Ur_nshsiv`8}KTe7L=#&5)hN|1he)eAm6=b<*3ApWn$Yj%6uoHU~ zy4nEd9+LI?`(eRTfDlO}b7<87n(`s;OVzFBXIarV#&_4}cYTd{D8H&q92V}{uXk8o zKr9iEc!pFV4cm9nGX&}^(9=gL(bxy#@Ek-%xi!)VHjvTx0_+v1`0Y`*@0SGt&u}~b z{#Nslze6h^1}0F%a}TXSZGeDS9c;J93oVd$Im(t}D1?VU;g}6f^?NtH>qI$q0Ldu) zK&SHl0X+=T>nVI*h!G4PAPowFJr!LgBpF%Z#;6kP{-}+tEj>gTbiBOpO)pt&e*7b1*k}i+Gug=uSk}f(*H`zZ* zJ4jwzD%NtDfpg_w#I}6M#wwnCf~)3K)lCxy-1VdAfq?#3&VvjaGf!*^7IC`;rVg=( z)}HB#T_?juiVvnNL<9{;8Wszp9NMqsk~mE`?B)<$`sxES(`{bvF1L^D z9UwCf-3y%Q-n>mlqURt%z7_`>6leWAkcjU9eR4{QB2y!1PlYPnlHc#%+eOT2Up)&K zBbv<%@qV&c!x>=UlOlVy0(?O{c zF^JC}8VKoY25QAAiF1>o2y)atmneY`jo#BTYj>kw?Y)yFnumx9kpt750Yq1u*T&IO z+XlgETny<(Jw-Am6~}U=3Fj^m8x9=*uUU`wuyL&%e!|};d0B>!{0AU`G?c;xk)WiF zy#7B?W&$cW-=@2=nf@$3NWk#Tub^m<3z%xkQ+R)cc=-Y1E9V|bbch+=bJQQ0xfi#m z@P{EzNh}V`fDl$nu-47RK9&ZdLI~;e+S*YeB?W&8qJE%*o$Qg5PSk0zHM~HVG@u`YQz)?$DEY!B+zI z*;GK)$?!U8l0)9IX1bT_S}!fDWYL-s1vt~b z%wK6E2uK(DoHeARt4nI4513PY#|g2__NfOb74*Pe)_ppVfFdvVN4$p^GxZd8D*N(L zG%oqC4*;j4y5xJa?{Az4^ufHJr4&Ss__@M|AE~nwyw$@;>dZ$1Sz4K%zy~KB`a-sC zGcGZqKMBZVU~v-zK;3oxCBsye?pgnG@>>>qugk#zKmn^UIv`%kBmhioA=v^y8$Hmv zq$&Ut1Jda~Q%c57C|sZFzGAKtt#XXr_19JxkyzImzYuvxP=*dHH~f6t7;nfOee@#W zlNg{6*iY0QJl>xhNaq+!zwMIp*b8}ysyh|~NURS z0PTf|JhDmhPlTf2df%qWQ^s-n-a2)-Nq`Z;M-tmkgUrrTvzAIjMJjjJWumDNZVvL! 
z(a8`pdXhiXZ>a+*z4og85!q7ZLn{I3lpiLQitEZH!(DfOjlkGvfELx#H2pn}0!%Bh z_fE=Oe&H2?W$|}_DvFAT9Db zoUNZVoM%CeCKSM28Dv4k>nV>kN4?$r;8Z|59a`tdpRzAW+*!kA#U?C3@`H3m#4LjF zSHD>xUmfQxu=X7HW$OBxpx6cXuSW&)jV1v8!C3kgrO(;Vt}TgK03b(w#>qPcGCY58 zUTF|hNPqA*7d|r1=m&Z*IOTpqT#XG_&nOsO2~)bOG#;YJ`j0{jz4f8J2Szb1J(MbH z=q!E(rnWcICeb{H7-bC1e1WEg1yzLYhart=P9lomd)S8x;X=NK76`o7a#PFZ5EO~c z%PZLSxrB7HR-Kg!Wif8!IMLH1(@YfK{${O&y@5cS2LO)wQqER6YBv1Fbl>2Lk#)$n zy^Y%wfpD1UE30o7CVuL5-4z0eZvX}LJ3#WNGQiVDMi2m_$(a?AMsuCfM)=&gUG3G5 zMH%XqBFCE^QBHdL-op#6Z%+WdA7|@P zpay3X1CoSm$aN`DQsow(YM7ZN>vLn7z6c!4lpM{0OA&^W0rKXkF?5QL>WTXet~f){ z0yd+enwYM{pKy|+WRO$nC&I((?H8MCwA8zC52L?5;6NM2#>H1Vf68M9VmLP zh5mKn%VPX3Bf^V${xX=L&%H4qFSxsWJUHb;kDLpif*64*^iW39GTQRxr?_=w?oq>_ zD3Lt?(*cC26q_Ky2OOIyDj8~GJ!z+KVOE(63X=uV%mHPG-L1bJoJ}$p_DgJb?YEkF zt&XvS9y`nf#_b~Spau$4jDP@tt}5Hfo&H%s8trcklwt*s(H4gM*@)2por z#sUFhL(>-`>TtQJqX8^ZZuZjwM*4?ole;WBJ)^nv19kwLG>2>#(8a(8+?TAlJ<+-n zfOhq>fm~CQ!ZSl(yvk5re7F~S^AD{96Rhu!;y zAXX>B8Q?ioRrx;ifXLiGGY-DfP&h$a(5A7Y?xC(uL2>qzV$d;%y@9E~!h2L*_y%h$ zvqul$Ny@d~SG%^o?dR+5qLrAeQ4`)X4O7pAvxnwk5|S>jRYF9#R)66}V$#OlXLxL; znveMFnIT8vKEW~_CpE_;<>_bI$(XY7J}jSciNQsu3BQhw$_3bnvgecy%WFxW^I)Is zu}FDd4N5t>$E4O_$r6eBAzw*YeyJ!?5DNpR5)-IyCZSuZa7_ ziJ@~`{0r3j99&ZS{$K`he?^<;!2W}eLoX^dNwCxD?H@XbQ-q;@J}1(?kvgurc^b<^ zpl5*9%4h%AkLs@W-A=TZRN}s^&w46|?tx`4v++3rBx^2D;r;Vz^DZUM0d#yuMf8{a z(U~M9&Kt!VHBR$<9$pIqf)_ue?@*X5r&tEOAAGVI276|^C|>a5gVoI608_?CPR&UN zT(4U7wrzL?^^|dx#6As2d31+PO-OF|6@gBpxU;XDTRroOIwEu28w`|ZPf4r2ZTnP| z3|6`zwp`)Nn1%R*wcn}y7J8I~foWf~7Yhfhct|~33M#pp8|>o7>S}jbR@H93)7|e2 z!Q^w4*nfH)tVD3Wk_O^H^rk8YL4y=(pAjeoN*LO@X(l)qcPf1IFU25yX};>b@hk>M z<8hLTu$>);*)w&OYoHAZStcvp04Q=UG%b4OBkV2A11G6Rua;|8ghYJWtlWl8hWdD- z7~Pz@hc26jkp_97RPn({M>atIhl^u@@0RbU@z6HETbc(zV0~rtOW~tbAmY;wbA<4P zo|-DVXi<&?cVA3m;Q8(oU`timXFNB^R_t`xK)&eswyTh|!Lri0Z05}Sx6OSVYz=4g zH#9qrw{03s3u!rjga5K(W1&fN>vz!I4)@@BV61o&Il;(l%WIt^(W}Y(JNoku`{T&I zSbi5IQ?{c0CH8FJ%gojq+tk~y?}IX2Z;wxgf8_e&Tf?lH#P0~c%E%%kOJGEN_%A%n zp2zvl&KpVtP=U~BU1|fH0R+$yeC=ygLPbe-hWm6lolN{K 
zjLsbgy6uht){r+C7fOhV$-**Pdl23Zdj*qVsx-nvK^uSml(gRLJGS)oPmW0o9I2ka zl21e85dMwOo~^E@_`=%-@tDV9a4fnv`iB63920-w@srleP3yYDYf$#hhu0j;;wTp* z`hB;p!SDSSf_Tthnhxfo)3<9C)BQxL1ZYL-pFvd|J3X>=ot;V# z^zjkN7G@1i7f+}4!sROIhFNfFG)d^x%7HbIsO<^*T|lY#Ig32R`CJ15A(n&pfqqErEX8H-Bpc^-*id!}6+X7@f)6pbmMuLeG8UtTd^SSP5XPesPs?y%S+;|Ur2guy`>|;mK1fMz~ z0Maa;LMpv7{RJMbVX2Mb8>qUFZ?lpCxbz_-op`Iw*CXgRkAaK1LI*218w!6+^- zV0?$Qd5~ZWGbq0MHUfWB-wG)0NOV711}Q*a`!BbX7Lb3Mu`R+$X-v(d>@p_w-e%R4 z5HJXu2_c&OM+*R>4<#SuQ!{d;85>8#P5`J8OpjmdSzZP0@fhX0`!DcPiCE4f*8HD< z>nz6%@|D+`8C7XHmfxBOobTj;4uhJq)!nm9LehkId8e2hX>MtDJ%~Q#d14(d>5ezU z@ve3!wN!qM@(m@YFW-{_ixI3<;bu1%{`Ay;UfXAA`3bQSj}u0F}PhZb~Hi?&OfKkSXAl z7on@CRZ);5albPz`?y4S4T!|+)6bhf#$9h^?Q)?8&`x~fhouy#+LmLc^A0hMQ4dlN z8`=G5Z{}(&)pAfqe2ZSO$U|h(Gw1QYFAzbwqn|P3^DqR&W#IN%Oa-!;CX3)jXWB8$ zvpv@%N1e09S<_Mz)d9x;#D7+OII5cGb!C;MU;h=hK(!W0ZK^ z%W7_Q{Z^?-QGk_TUs4h6rKY9^mJu8WKhC{t|CT?KL$ZV8d|InSjkp-$Urza}1Dja+ z5JjbrhL4Abdh#NNVD4Ax?15ju+gG?aYr-3QkvbXTL5FdfA?Vw5JP9b5cuscL9nGo(pa@p z6MAD~E?3S!>DpNGi_fgH^}O}80L-bTx#S^ldeL-z9VFfN=jWpk1KITyGw~YSQLHeI zh6(HOy$j5mnx=|+;>vNb*l~A}u8|_eH!Uaq3}ZV6ejfFDv=|RXyLPy3hFEfTw~Wj^ zkuanadA_nq?z!JpVOJ9PVxKkIp=D$(Y0h@}aa~6cf}7g@05pa*g6t})qSqf@GkyY; zRp$Lbf2$(u`M71o(6V)C*f2N`DDW_n*tm9f>G2u*-}_pOj(&RZkzTbbGKmc{dQpx7 z5E_9BMb%oL*HadaiN5zdiO}xu4Wi(jCT5JAq*`zSohdQAUeHFdE53xy)M~{ph#8dw zCPX+l?WOW?aEJ2$-0na^CC$++Cnq+3f-cugsX$mlKMGL z6;nnwe=)55=B2$WFSD2mK)sMSw9&7beCpgUBqzhCivqG86gxn^!?0Er&8k1^^YydN+}l6$9$4lRsH{Xdgti4 z-Y@LCNyElyoJJGdw$<2nV>LD#V`AI3(KKdb+cq0K=lgq~=lv^dWzDR0&pG#;+4tVp z^|=JAbTeeXAQT?e(1Sm&Sw_5e>EprZWo|}{mhO_zgxBCTzU7W<1!~QQv^~aWMVW^S zdRofsfPbKB@DeI?4t>gaU95VT+D~IL%QmZ!9?j%S4e9WayCwo6m-sNsS({|KL_Mgv zXfd)@Mmx^oj_^nHA7tu?d(?J9rwdgGKd%K+G}h0Zyr0YV1i2RuA2b%j>ppl?c77qE zxZl$;gdGRbewE-OTN@xY3de;y7CS>(OLxHD{fM^#P6n)zh8I1~liZSUTQN6?E!Gru z)sDGG6-YCLM}^Rh0ZUS*JfhTxvGD`7dL6FCVFQuai&WxK;c3ZU&cV}qp;`dlY>Srj) z195S0n7~sG`whOzOMn`}|FC2-NFRwKd1ie5rypS_aO)4F4JiDJzO;Zf^8{Q`Pmm z5vPmTlFH8E8x4~Aw@w4ol_809WetAZ*<66oC7QNmg5k#)=!m1brr)0=<69~! 
zS88SOS_Kr-P}6`;L)twTpa-~EW&G66xsPTt(w^vYwlpobl-9eO9BNGvunbHHL@kfO zX`>lD<)@mW#q9PB(*Lq8-kW#l&Bpe!OLCIGm1L480DFglVA;dtW_B>zoMoaAk2AyK zqC??*za&Hd1!&LQUUW1Wlbm0+9zC;v4#?N**aM7a$vd5VkAx8s;5}Sc&_*OGA@`I3 zZ5m(z!%kDzJ4sjps=}+_wkI9?F0=Z}1sId{YhY55G_k$Rw9$0xcPngx=>Bn&J%~TR zuFgWDxTJfy9(2=Gu4&x_p`-BfW27+=P?r_!6s7@gH4fwL&5gfcOBKqbc*_uj8gRbt zDu|*jWR8E|G`^IIdbvT;^{!QMrz1-XP>#Rdb2`4ZDH$tk%dVTmG7m{B#bZ|g6QA)i zpd=3onb{km1bpIs^VWqwtgX2B5d3s$+opOnw|_vT%l(0d7-34Qj?-sG|# zG4kgg-Mdl+px8-t0LpYxM+_T;7FQ3E;;%!C37rMJG8f?`5n=c&RfS?Fg>UqA#~r2$ zzY56h#w0ACo+;52GU&-&cK`bq=rD1X+!fe7qc<^UROlGHSSRf|oU(uRbEIqr1(T4) zYlqor$hYW)+tRkSUC*rg*LT~i;5#Pzse+mI^!cm|C?wZ(N)&SJAb%xjA&_o>SCizU z;ALX0v2mY?mpj=Z(+gzCbx0HJF$+yi@A`>H4yJM#EKNgRG-0u1FyfD8O z8XN@VGm#(N@$S$+@#5+66U7{^lezf04EyTqU>tl_gHb!sYKQ(x2e4ZqEP7iYG)We*4+1B9@{ZJ9~{e0WJz+e87$)H zuqX@9!l?!p@pGM4WV5cDJ~?}SH)UdgMtk;HrO+-V0m<%T&GlD*qn(-rV2Md5TW&az zQvDe<$DJR<=6mB9E)1cj3hE%jA@SPq?jT4@X8zfF_Iy=vW3uG{2cb<{FO*=}x#iAv z3n`Q)Bs4BXI_n@v7L?3uGA-+`l86H$g@eUryE+)sTK^m%jjt5ahPQ`RnC!E>Q@$^I zr6l#qShbZk(ee!Eo8Rko@wdehAtoa9?FELZ;YRn7(t&D3_d}H; zE{5-*=9uS+XQdEm3P4&nxJiB;G=QGb)-huQN0Byo-)1(3RWR-p?FhO3S$Kk30Z`vK zN+Tx4PuuSfMu&m|pGZjpdj_GB!hOXZ2OO-)3zB(d0i6rb+)njM6NBmxDPtf*U`p#- z!<|B{56?r)&1&QY)BiPHdF=UMz~;a5&vZSjyXgxzFMS z_gshWc|YX=GH|*&-$aa&-4a_3s8?~~kK3u}ialYVrJv#ZnuAwV zb3-9Hi#5KXC|nbfGj}VSo4V(LtA~Q4C!PZ`yK>viM z1<-()J?cFs0yKcm4l#nB5B*T(?e?6at%fODap2OVJDV*GgztUkdkr!|mF|>MlCMQI zCxOM47?d%XGo}(-THc0^e)%i{ry-0(~SYB0woB$8=ripuqKVR3JY%ouI%#p4!zU)HJs0Nz1c=!BPd{(T>1l$C8XRrMCL3 z_&~5cT{>}6Y!@F&E0A!srW-yv7QgcX5B9i#u5S^i4LA5`P)hy!ZBD(u-6f?o5YqDL z9$Wcxh<)eBif+X2Zqe2iGI+Kvvj1X@1m0R*CW~-2I=$-q3P*T3X~Peuk;c(lGw@hf zAak1u7+X%%50Pr85G3Y57raUCQnafEim{*)O`FNSx>X1nsU zXD3Xn&lw!f{73=%R!8BHEZkpbqDwf)3fYH|qAJCgy}gA=M8rr1tCVpwK5(D7X`znZ zGTWTfk3okzyweU0A8u{LzK+2_GwdW1A}0e=2PxrzP3GC*!LJ%6F@-?)jPm~6T8hPO zw4riM_t+OG`RV!fy1id4xqD;dAdUgF1$T~bxv4gnuGC6sm^kqa_IQSUhrCq~Pf}Yu zr?ayzDD6)HmK%E*n-n$X)`Gi2ULj@mDWTii_JgN>hiet!ZRj00@)+8l56nk|#=Yc` 
zVmKJF=}uVBxEB%+eT0kf-l=TIyFJ5xAg|AxO0-zz0c8IeEPbm;QVZPUb)_klYn=)6 zfZ>oyyqMaO$YiyWG zc)`ARQYa3W$|{HjAJ}&qJ8X75P%e8cC+!cjtqCYqX^r{ZyNuwec;p@%f-U0|I^{I! z)bB`d4oOGH##o^i>&?a{(%; zb$e1#gD3I_FY*D9^Uu(Ald!DeR#(xT0V6w=g3yL#(k|PSo+YlFOZYVPi>E<((7(op zs8#dV7lXp%35%B!^Cc0W2ql`g&-)Xn2-#Tt%ijU5nmgI@ zu6)&I8YU$x;6Ia_c`Uo>`x`DCD`R`h_uWk+P&8zzB-aj8wWjzTV}KI_O(C1}+s~#^ zQDR1quK}t~f+d_` z>iq)-Z!H!{0o`LJ=mfdb*^*ci4qXGqfL1%rPrH{2#-E1st!ug#p#C0)jE*H?V@KXJ z+sdl>;hb<`xj;QgK9t6wrj z`@%eg0G9Jb)k78NY#l+D*-}`UV!%Mq5^yGP*=0DkYWd}%Vo0XrMhs1^1(R%5UPF$X2)zC|JV{EG8;7pmHklgY-1;tg($@iyx764oDKkP)_{DdBraP#54 z+)I$>WB*SQf_n)}Kr7U7&Hm@vcnTFNq@A^jq{aS1FU6dwR>?`cNW(Hrs>_{b;Y4uj zVa>q@=j_ypUwLh3q2D|B6DVxt53KrBh|#yYhIRe9eSzTt(y_#HX(e~Im`o+M%^FL` z^kgwE4Tr`Q?x~aXdiA-XN{(}?0jkXJ`O;sHrHodtw4R3D6P-F$U!dM!RagLb(%M(r z*O)RKt4e3}$~b`QWF3OwURqZ*aj%A-23V?hYv`4gTQvZ`rqp&sP0^22b?W7gmboNn z0}cfZ7W(aC0WjB@0IRvzhR>$y@z3uz0G?yDW~Z!u$N2v+(L9QvQ^LQ#25 z+v?2q>J-zq`n2Mg+15+tmV}*=^+2I2x^9dPXP`;JZdco%T|mU4$n8@Q z3&muyV;s#ankKPI!kel}tniCKh#oEtf`aZUO_9q&07Xc~TzE#KBFP}xP&B-jJdfT<3u$+tD(A@2^Z8PnzqdnnD5DSX$ z(jjsmuasd-t5NEUq9mXSoRWX?T4YEI?0dfKA?K%+b-KBcTlvQy5;Az-;1~omL&2@Z z(`ZSCYFYBlr*X=KJ0}&8)$Ubmkx^fElLGg*@U%MN$tp_inD(J*Bq`F3 z{8BEA?%?%;!{o}>aQBND+oi_>g}Y^Kz@LurXVBm)KKl4_Y6C_?6gS@&tBnlxb(rUV ziH?b2OKI){7{Dk=Q&QOyoE{j$v|0GLThzZLB#}Q~cB+(L;jw`7b@@qVi04$>^l59>a$VY2h8x#KrG#oL zRmGKcO{LgxwfgzfurSE3*QeqK$%qnq|aE^ui)F6kH~i_t`*HLW%@nMT+;q(ns~ zWQi4FAS4Kwu8Y92n+dQ|L$edSrFe<`3_w{t%);G&6L^En*@?!X5CbO=2_+N)q0i z;Y_eB0}Y-l3JI$cVS9SX`4Y%?G1!&G1o^ZJO_Mi%lFMmnWRD zZqlJcDwYeO{u_;DV1C8jY#83gjRaSbg6F`_YmaLm^JWrh4}jMYAfdRh|336gqp|?| z3yrYlFwbFpg+L`nlio2)k?zdUOxcc#BjeXo2ADcP2mC*&>h{{4e`9pp=S$>+H=EM> z>5K9w3P7;spnCM0+w02CvA2BXni3Z&F90T4=Kd0l<&4QOT^S#cIbplTBi(m!_S`qv z=FCFA(pwyV3-b+MGgyjb4FZ2?14qUcbEn$M^hmIORWt&R_RUR$Z01vmu{#PsQV5wd$qS-bfb|s4^cI>SpZwjzOdGInbHE8OQWY)*uq4XSwJI`R?T8+4TP3w6-=Leqj^agX?;y33 z8FPgiuJtoIMgIwfL~vunRLvYi$jn{8aBaz4{e>PHLL)qtwM(>R1-QvfxJYOZL#)*Z z9?aym&Xm=P3LUExgv!tQ^IrYOYhhgDg5yGx+N?ZSTwPiGp}y7k;$Z{<);fH05)7gj 
zg|0Z?N!z+1QMjG@V+KRs1mcx&^a(srDU$YObyjxcSi<|WlN|9#Blrl1nyWF(^t~hH zOy0^I9V70UwY*Rhz57`ikN#tcYQENie71FvM2tj^mKC$4|9$J-Y2xX4jH{Iwwuy|l z7gd%@i}`{Zj(wP9u`-b(8-GX`tHZpZ$uy1?KBf=-5A@n{>tz`c%aodrRgS%G3ms&o z@KONbwUW2XkkIj<={k4Xmq!XH{y_`Zbz5Dwxym_NUcGtKHRsl`4)FWh$65j(w$iaS zOBE^h&sD*l#af8Iy-h6$oYL!uMgq~||BsG&dQ5HPOu(!(W8kf%{EkngDi5#^yd`@< zq-fOgV#LOUs&Q!o{qkQtU2GKdUX7gzT4ck>51-6A&`>XF;3(lZ zF)ha;(U-WNU0rAi%;lgOkJqj3lnw1RS_R9Tae!t zrI7Z`V~`@L;U$k>wzk5F>tztezyKh+Ssg4Z3R zsH?+KE$8ITCq{?abW4aYHH8|4nnvJ=7u~n>KM7O!xmACmH$C9!$ZNSqN`iI(0}QfJJNY*ypB@<)fgsr?w{2!D0M z+jn8B_{!)-*t9{h&FAcfMfwD5w*+0tcZ}QE!*G|_J%W{? zhOybozKg4RhSO#n)ou=4gtuTgXZ55V4#;B#5!AV!N%Np@9fOv2AbbpOIN|JBz5-fV zUr1_6Zq$-S2<4h~L>75vl_JNf?D=`9oKrKBtmhF*il^59EYD9dz_4^4EMS#bxCIIL z^wC61=hZbB`Z$;tMb|^}@OFJhI?~;L$7x?|n5gNIl$LoAu@aTgReU%3oc>tA&f!@_ z8q>z+?85PDsdQ)}@saY7= zt;8px4yC~kyY7bgiBNL)>x`E%%WBl`-0f(l2Sw&3uW5Z!GiyjV*sp;%`7gxxM9jMpMaU@_@m%#uY6;C6X6 z%YhGvl_O~MvCBh(xl4IcQD7djQ1H|KqV9bzy!hcJ#k9_+#aDN`dlIQYtmtY-T|oh+L_uvuY2lE$;>>mH!@~jy$23+J+)jUq3*B;XuZ@^qew~ZdcB`?==$F5|h~XMJ$%5N? 
z1e^%{{cl+EZdq&$)6QmGeAV#^j~c^qYQ-|Gd1p20{>*FRKgn(cmy{b8{~58;asbNF za_1xSFzx$P2d;_9%AkQWL!=ws;WBrWT9`8;{xBeWY4sSYuQQFtsT?O+x-z!ywReLD zYl`On2uf@SMnX=sHaU|AFK*&4qTX5+N6u#ZPBG+v&m2RgOaw$lx~-xlPZ>8necee>m09Qk5^Z=n zhIJg(N2NeYVcHN|6B9 zPh>red5s}r!@Pl%vg4%Hqu83Xn3qxCWw^n@dRU+U&Vc_Ymy61@s2yy(U^E;cH z_I-g1*NBIlXtQ}i%rn|8W40tdz~WlT;#-p;AzE+i@t9$-zreMRxlfmqwIN+D$)e`g zYnOSo7nGgl#FCM-Yvux{OE2IaN?lx2%B~F}&A5-ODnDLUDbLl$?`1l*n`XHt&&jV+ zgcH$oFwe=H3`$P|FO-;)ExItsD7*%@fG%og2z3WmXPwIxgu`VJlLN2yWl)`f(|2~PVi$}5qxV(GXHA)&Z z!V5(_$`dA_tFBlsqR&GqpQkW4(g3??!y%kGfxesh*=Pcpiy=R5osP+UL%X4z{~gA8 zSXH(2)sW9wshxW$>Do;0v*ULG+?uc+oGrQXfFC+)W;hx#p-AMGIemHMyZ=j%D8+$L zEl(RCdAU!3S#Os<%jP$dFD(-M=;U&8vT9;UT57iu+EohNRv+QR&X0Jca1Jjw1_Nx8 zv0&BOTN{mitXn0H(4|jK51PUOgCEPUCe?wX8SIeGNQ-V9!E62D81gpk2QT9z9*4V@ z5D-TU)%ebq__K5OM3=M^bK-H5A`6Ubq?N~gX zG=J_B1mv*`_<56O4_P$Tlp@WQa{1rCCv=L(%!a9Uw^6-@hZWFVeT%K=V7(bro$a>}ukTjf1)Qi{RC@9$p0NqU%_k2JxKdqQr90Xh)nmSJ^ZVx&LXY9huNV-q zfIKf`NzW8-Q%Z_~*&SmrgE`dvU~2Ojaw=nCW&_9Sn|UV$=((RW1srH!&emAWh&nMK z-FJl)@;ifj zU|({>8vA2%s0?fOHhGayu+SOBTmR;H3}tKKH%iMZ(*z4fd28cV=POKF zjhQzx7E+b!%!^~SFtnz0mJ{Lc002plp33qh{`qV3LaX+as2W|4fw;6{a^)>|rHlCd z;1&0%NmV%ax3NWE8ja@(E_h|n)i?sv0%)2=n=C)chusqsxqpvjc?v{;b6A{U$%w91 z6jm%6XWL5CR6saagfth;Din`>F51c=i-&2Ux+;>vH{D5`{hL-D1Kz8g zP%OuWtfHEF^432Z!7;2Z*d2=>zLSO?6oR5xvJTNpyFM_FYCjPlh0x>YDOLOGZhqxd zRR*12kNxcaxM4-P^r^%rPc%beDZYc#Ot8X!)BmxlIuo@ZmgOm5u2gUbKMW__8v+NH z2Ga}Nr6~-1I7=&OwGh_=F*B$U3Um?GfTX6ctVt-=3zbUtRGnzMobsL2`x#K1!AtVL zI4kc+$>K#xqtJhV?Zw~bxI$ShHrQ^97qQwTpSkRP(*#F|!3M7qqdaV^cHPrS#jkT4 z>=T^5`5^qEzmegDQDMwqLEeA%<(ci3wX`c-Fy{^q`BZPw@O?Fj%{#A~*ju_XBmVJT zlP?c|waVq(?IOyjQW@1G?y_@9!!~iiPh|1GQ4m$@uD#FJJ2ayAoBF!IrpZ~{ zz-Ayb*%!rKDSfkbXX#xT*#LH~g_GJuly+B=CKXm^DJA{;5G|Z|0waEkU0zsY&AFkm4U<26Q|8t~cqVNl;ZJ}y# z=Y~D8ronqshm*(=3)Vi0GA)_QIJjLa=D}&i((=IoEqX{cD3*X0F7YeGONo*XSmi0< z7N=Z8(7q-Ajdp);fn0Y0Nv+7}PL|&C`9b>pKoS9~gfCvV~H8 z!sk1gFK}Ff*l){Ew|mlRcoO;JOYQ8CU;Lcp1-?5mwkn2TsY!&mdhkmQoJSfc){O?y 
zu=&PIRA9d3+>EW9R(E{&u~g6Wa}xg}^FRv+2N~$k5AfM5~peAxn31_!}>1WJxQ&Ca~=`@ z>ZHaJN?goL#kE*=DWF<`W?oMgB>$d2_Bov2dj`!`JpYIm)%>iRfqxFzh+ZmEtN+~w)q^Pwox;uR2ymZ@v) zdY=$TXka9_swk8t+rl(bKPoWQt2WacRhyRO@Hy6%tq$}jCWcdG13nimtf{tm#}Bg} zY_eYbyI&dtRd#xKpg=gTx%T}4(# z`D-@?)H-w(9nE_;B4^%d%yrz%RtPO?W=<&yXyAHRBinKd8BD3*b2qH9`b?9WzP`|s zdff@NBFm@GmKA#+5=Pp8QL{u36tYC!x^-HJiFM?AEA&L^>iRu#QW6X-@xQD8zdz(8 z#HQ(SmN$skj!>oU+nlFk$leyq;riiCb90y=5Wy_{4jNi~SS({ke6=0teqX;t6iyzG zCDvkLe~4^SN16ArhZR!y$=v;Ejnwnj=ly6?n>jcm4TE#FTSuC|Gkzb0xTgzRGUNS> zWo!IqeBC4hk|}Rpcnz+;m2mbn^HQF&|Nl~e6&WHxV1q3e^N7g@D3u;!qr}dVZ|Muk zp&$&zg=ooi9v7RM`gpif0RdR=w{x?ez{y$;qiR#8@MFn))tgxZ%V(w+!P2b zDy05@7c`$oeB&p=O>E*C+V>hEJcgR+lTDwB#XAtAaPA!3(S2yvH%CCsA*`)k zUJb{NZh7_zpHS~>5{-Bw9?&)Nw>?=_OCU5~rq=Al$>P0DM>jtm5o93gEsNMj>eL^b zNW5-sdVadzt&KBO7m&hoc8Vr;MrD~03=hWjE4gl!$PNDm4k8{k`c7sjTUC5`STu{D z3=ah%EcCxelyI-ne=E!)Q(J@cn=wm7CH9}!1&z=b4Vuk~6X3e_WB zHVyc=EBtPf;Sxlm=+b1Y;|{+HSF2-J@;%Gp>B?w4jTLsi-36mmH`xDy&#aBZZ!y0! zIYTQ0Z0*&=LF|TB$r4Sxg$@%S%8aPM>ve_AI@2ENInkSGA zn#o7x6~vo2i<`7IDispsgWb|T{^Lxp7IW-+oeAuJgi(MKsFY}F*ADd~Y|`mGUm%RV z&m@Vv{U_m1j4PHZ;;m1R&7(s1h6P(fCr)Ai+t&ioLeq>p5&zJ?6Q*QDp6PYW`Ib3h zt92;kc&<}ivanQbt#N0z-@9UhvgXgtZOIP|xWCM6F#PgMq7g`H0nx&7Mf&p_pFhB? 
zCvK+y_lm<-m!@cfZWU)>1?Li(2bTv~S|cIF-yw`M-U@i%RSvZlJcp zPen>ANK~9a2_e{X`%&4qQ=|QqY7T4U6GeaD-qne8XU3jkB-5FDafoA5p-ux^ZDO;I zHx1wc?P;Dnh_{&jyF~8S<_zzUBfAtlovq)Hqtag?MZ7kBMlkeH+N6;K{V7qmCO*k0 zN75LzHmi@4_NP-$23L~6VaEDPRMt7$PqQd0q*$l?y8U_j;h`V>-7K%9PtPE@)oKGT zDf3TX8Xo~aN|D~5b`!>-dW_OiDSnrMF`hvGOHv{BmnWCmxzmsupACxRBo?n`(iz10 za+0(5Q^T-`KoTD9H(#$Y3ZEoZUL!w$DFpI6;y*@5kr-mT^7FVhEv6d&Uk@K z(XR7ZbAfFX!C)COqF)9MwF-N*((ObLrx|lQ>SWid%1ar$?%E@}+-}m&OBo-pPdkea zrry(U*MnQD8#%2XP)}E1K-_#^&A5M6k4|J*uXZ%+HAz}zZ)OkwUchs@)#uq`-x0+( zyKHxTaOwHEyezPS<|IiLVY7b<7rr-)_wcYYOz?2k+!D&Vd!8kG+C->Ta^xEcUf+*a zolTFW%DlNz{iPQwE^W*ryiTl==Cl(G{T(>*ct-usFx+srHabSsz3YdC$@T*=DXwGQ zQt1BcxzAJ02s_O0vWR}wn}YjsDGgfi9+)BVI#mY=+`u8KRgil2e!{`$m?Q7^K9}08 zg}_~HrdHD)yxA`=Q<^J?Jz+7B9&JH&piUdC%4n#MyB<(3?7GGpx3p{*`@={IiB|AU z@@U+|h@9CYZN=RB+qPe|$dga?A4bbH0egW0!=#Y>3%+aP*K<(mmfl8vfTv zW|~nECg^3&T^>hknVr9v)-;+SlYq_3SMWGCYv-#CkNuTuL6|)3I)ZjmP*p1ND`#3U zCbcEAHM{h*OXG+22?i7tvDc;kJyUpNZ8rb@!O}%q;^(^%*aP9+doy> zB=wRgV)sXRZklSo$2!f#fL%@unXr8BC-Y^`C5DGA_(h_3-xRM!HbmQZhuV~0CnnS7 z)QHCk3ff|-jK{ntIEU!9LN#=|ehhW(+B5j>EWkhYgh8p$lH7@$CY(=*bon3g#qh5m zHJSKX^j4QaoU=`jB%2!q5AW3rRC!Q?dDrBL7|VF}ZwFOb=+xIgUw7tYt_YL}fmL2h z@_2qeifF1lbO%VM4&vTqe0P6?soSW|ajM74=4pd)z{}6vK>Xpn{Uel_(+t_NAzLIWXvB>{6>NR+)Fv#SNOth^UGT!$CqG5@{ z$*gNYg^({C1XJ|wG6n*u9s0Nb{Hg;~q^dRg>x8nynf@=mx{V}ZuK%my5}iiXNa5V! 
zA#|Wx7JfJIao|fGO^dEOnA@*~0r^WV(uGzg`G{FrZQ-p=Cm@AO zjPj4ObP9ok{Q4T15 z$9h0%8tU8!-e7GrtL%cR`lVl=p&hCvld_1Nmbk4r-YWY+UAFbvyr1v6P{#nH`~6;{ zUtDt6Y9a+q=)3+mZDhRQZoQ73X9%-~7AXK5?SlPvs?q6P?E3Ema6Ul&TOUQdWeIhb z6c#=NE}4=#Uf}h(0adph;1zxUcO~lcQ|cR&;U|E$775tj@&NWw5zun*frsqAAt2{Y zW;22WI#mrgJ`HQMA4f{Pru9DNZ*Poz)o*rs^)@x$%PZq+FOPwcxw%3v4+YPyBpm8V z)?%@Q2nEP|nR_gZL~w~>xoViuoV@qbv=gXbL;T}VbC_`tzXFP*LhZXSG&mwY%@pkY zKeRc`t3L}H!@d5v4z8MJ@FS$p82`9*ce`k5bk=$Gtl}eN&+nnClc|sPll`}YP)M%} zry>$qFYv|Y@24csM+r@8!C7)-g0SIgy><12Hls(oyEHB|_6oH>FK(MnnaNfE9=0zW zcbRtUh8!M|IG1?!I6ksFjiGOz+x{cZv+2XYLoNV^P>RJMV#P^Rs+Z`bhpMl#iZrA0 z{xz91fq{@$Ilm2CLHl=iKiMp2psug4p8#O(&pPsGxeNgx9srG*&9W~xuntgF6sxq^ z=c;sC&84$MQk$?Umkqm&KMx$x^qFX`A7+QGB<&dX#)Ung2H0YD{ku z+e%vUz}^)A{*MarMm;$Oyl(VXqka$dV1?Lun%|Fl<#!~}Ao23IftS zD=TYUaxzQV6Uoa0_363fT*E6I2Yh#XOI5N z@q54i@qQ;p7BHkd2~5^?78Zut4mLX6?8kONLjvzlCf)!}soba!u!u*errsh+N@2ZU z{Xwy+c})X@jd35RC5g5}S$rFgl!SC^HKmIdUSYHtr>66D%qRuRXQC!D$jq0Ru!*;` zsin%VVR%gbz=)yX!|d9&ghRA>m z&9@gZ$_(A#Kztv3Q5t$7sn7^($sKaK0%*01T`jG8t7JzqCk!@jHt>WYRkw8%dyU-B z&w+#?7@iMe!k#=k0?$yPPwwn=ZXdpxN3*cMf$i`RghC*Y2b%3m=XBH2*jycq`ke!d zva>=VfA!oSp4>?>nG*|el_J*xe6IOa<6aVNyz^OVq6d$6^L{TF6_rK48bvslfcUSm zuf`a|-Pt}kEiJsiKLH?^KES0lKlUoweB2UW9D5Yl0cR!?pMMX~U$wX9})n@kp#E zfPT#k=#5k5D4<|qi~$;BEAXy)-38MX;41+D4>gPd5&crDW5Ek=cq8yYH9naulTmvF zJh?^#5nrX`8s8?aXNqK}0p73E6=Gu#bn@K~88&`F$;b;al_I4ynlc}R2F|r1_R2h02NOQCN3Ppnmxj=rh)_9M7 z(&qKh%LoGc(I=W$hHaG6&AC7a9h6bHRHTL!1eZ676897CRbDdu467BHrn-a13->B z+;hcdv6y|4lKY;`hs*DIS4-eGMCfJ+{Pso+=cA9SuX&zUS#<;KBA{wx_yZhs>i~S4 zTq+X;=cac9=?=hUr57zDAe$ciJKRWH-3kfN@1^AB)nXIG@VOj_Ls5vTM>%l;msmCF zeB;sIyw5O5c)>tC2pN#DX)#6oSW@8AJO9tQkqTY4I8KZ#?Et}u`V#V|t-u9>rc6wr zze0*rl`ee$pbU`CGWZn|Vgt5x3OB?6d8tZx!C&yQ6!QAsq=ej{L@O4H8v?3*)>oF>;T9uWyMSZA8nTl zL?B+qZ!wTO83+ve1MnWdO5eYn)&N!OM(ry<6_r%HIe>6cgd$He1NB9mev-L~9I9PQ zE=qmP3SRqnDc;N5|NpcAJvGC%z=I9<7YdO&Xw_v=@7I*TKe@75slV~#v%_qsw~~+- zK%RwC1lwuu&N&bm(#QvrN8odr@314R>Q5lor@Oxi`8(Q?YCYa)$>e&xgJ3BOTyq9A 
zsn&aSF~KYURn;;(`hD#VtgtZPfsmq+4BDrIT*3Z{7clr-TH7RF<@1lU(?Vf>d}x|D%{(mX=QFk~-9KOB1Gz@qHBB_D$IYJ@`W|;mlCw@nt$We< z@j(7!7|^wlIF`FaHFr|HhHH(MW6ijU_uGSG5zq=7FEaR@nReWKD|9n1$bdD7&qKNw zlMQFJ608j0Qbmb@02wH>jCzHYHfYl&55Ow&ouY*E0VJ?pJDgau23U@)tQb1>`QK$v zlD4gY59z_w5KvFX004&VuEz}(&?-RoDT8Z{ib9m-Hh0uF1zr7@&e18L8QrRtlPLyr5n*V~re-}sdc|LHI`=#(JwrWZIr|}{hTp{3oCUFu- z^TEH73DvNnTQyls-2A8qW@4ah{>UehBoj+)3lD7GXIz~Y31Ie?=-Nua2=~MWd5}o} z>bXT`QLmABUv7X4ixT^*ra|I#WeCM-t6km|FBj?Y>e#|Ye;X~u?(K&|Wce*3BGv9I554bjgpxY53b#~3zohd;#>uftS3J(Hn)g%pNVo}Z;-3tM7%X@hpz5n_p`P?oVa+=a zjz@n<_Ol)UL>jF)LC`;3o}WKLECsgmN>avBq@Qvg^)*iy?5@R(>W9AP45F+yZ*NRy zp`jDlj2N6QG(GOdqFJ#edm4ad&@ILliNMcBS|X^WQgE4`ua2fw1}LI7m88=*Q<=h( zmpv8b(X9%C;)X^h_>xRUX+UVp`t-@Y?z0f!LA z(mMO1t_d~*ckeA+>#an3N1OeQn9W+3j*mlki3;Ty%)jsH!G9Ze$qP733Y+Jdy+Jsc#Y9rcy zcmbtO%q$Y*0NM}Gq*QKGxa~!~u5>y1ng`_#1_s*4{UXy``TpNE(5zg^_cpn<_vvyq z98UXhc6O^+m$A4(qQDYDhlqy$@Ov6&Ms~m>q#gXFP7Wdv1Ns{2+>9vPJ(Yjjm_I_e z>DPa=W%8bpvP7#`2CuBU{Z_20rkcwP(va|igK*U}UDmrlndfiJKTMhaoHnTE6{I*e zG9ujJdX5>7iG@|Hs&6BKijHo4D~5@QStyvJiRTAmVQc;4w^;xfU2(B!{R2FZ^sI0WB7A$tp+u7$z<-;8LHUMw;QjITSm7OsaPh0Wb-BT!Vl8zR$iRpK&_s*3*oBpv zbdbfnL&CS$?zEe{8huF_DZh$8!#{nic3e{7}uAhl>@@STeV>XK+0b{16^=+az(_;Dc(hIYwov2FEKEn^i3s^9)VCi|Q z988|;i4;3B`}sjFR0YL(T>yP2ZGj@iv9U1~y(@Qi<`Wg9~42G z<3MTUNK>q@=Ag-^1|ZugA}XI$z5{uwb&C+6CJQh?eg$e{H4r|pQ~m8Bzys>{^y{GK&+vQ*Bs$evkB?8!+uAWyGPRxM+LFvPRiEc=m?(VO0d-KF0FcHt$TOUkoD%`gBS}v0Ip5( zIo16?RJ~PLR$aI*EJ&B6gfuVR-O}A1Qj(HVQqn2i-Q8W%-6dU0H%KcX@ejVW*WTBE z_3=T7Hm>n6K_zMx+QLg_phzmcjt5+YDIzMhMq94=#-z@yNx+k}Lm;W%xw zggX#lhy9|U+JRxEFqHN5eK)3j@N>!qZ-mhGO5KK z^<+Mauc5aW75Wz9yXXf}={B8f?<2mNZu=$hI-z?H5ha&QDGL5g@yINxHARFzo^wFJ zjFfk3LdhU5?a2Cy*7o#>+jrKErDf;+sJdacsA?_I7Vx3&UVMc?ylv z=1a_8DwSDQp@d>obCB=YNJIOtuQ38HprO%~tC;{A|60rwHJfzK zEkGs+{&O6D)v2%4Gewfm^z%3Fg#}iJEpapZNiZO5J}*5+hvCM3#ypkq-MiLZ><98w zn%B3NZ#~Fozg2Q0aK@Pe%q;Daio(HwL)xRZAg=m#U&tG+EKA6D6Jui=12o9Moq+z> zq?xa3SZ(-T9Hm?WQl$7TXz~cgppYcx;?lfpyx`1n+Mf*5mXniHLq&6osN4{MOfUVZ 
z2Z7V0#kLQ(7fE8g>op(y>>GZJFRWDm-i{V(YImt=xUOnDRW&6u{E)JG`l}5mb#PrL zc+?yL^IoKzhi5`yBU@w$P+HS=J&enLRk3E(YI8=cYcl5<+Sz9HX>U!A4g4Cz3&`D1ZOef(Zv5& z=_`JvQ@q`H?fKo&tadd#>RWd@h1Ax)`9wxgH`{ynM~?!}hgwfbd;C0{ujJrMZ2`Rl zKe3d^p06#V$vzOK7GWYF2pa|5BmDMiW+P`*ST&5c&9Ns-QUYqBEO8E;Y+k25N}}hC ziBRq;$U%i*JkLJJrA~mZ0yBXuqbPZZEGMxXa}fwH= zdC%b(hu#GiDF(X7-f!_uXqB_D6y zH0}>arJpXOfNj2l09%GzNsIgg>%JpzYEr!e`y|>|l)VU)Xfj>$)WJ6xaIaA!3COS4 zzG72UYl%j_7lr6YNK%v>7%gX)2V_Zch+1d~(@8hv6wiQL#}(irm49@?hSNcu{1d_R z!_^yYBo~lCLQk&j8O@ON0EC$byc*PIP7NnX~rS`N;ex z^#$8W#o3yf0>|?uenM}E`}e>>rPfk6#+B`Lf`>`1lw_C1B&JSvDFZWZj$Ke#s+eU_ ztQ2xMj3It~7Cl<1{HlD>*K*d#d4_#ryHF1ULptL<5s*g}T**+tZRJ@fNv zENi8hJKz=ffzZj6z_kT2hWtWrcMx>g3e*h`OnIv=`cf-)3&vxW(!e(gkg0=M=bd4U z^yS)z@LfwxTy(Iw=cV9=U?9qPSjT3()WD75BxF&|P8Jqy|CkmJVS(f>lz0SQTJ*qZ`4(Gb~w`(G3o{mfwcQCCwfiZf?N|>kxO(MdQHKSMflH) z{l3I1K=mLY)!o`(zE`G!)+fTo2>RkM47xh|Ol*h_B2}wk=#EwzFxVza*{K6yp;i%i z#EO3NI!kjBmS7WvO;O3$ED2~=C&)g}^%iLfO=N_&GBtF{7}Uej$|ZfPF4OzTB$Bfg zV`+-^`ZdPVj%=awmFp(Ft5ybLl4jI9ZZxqFG!3G=pEzJ7Q+pDZK%FSf$fL&Me@es# z{eW5%h}wKFEit8a5-x@>d3}iv?;le?nNRM)V(w#rr&C_lDw|rdik$Rsx+_dOPCZ|? 
zvOTbPq+w?3Ee1yD?QL2i5!Dbf^e!e8a_ZpcKz$?OPOoQgfCCXXnwZpDt#Cwm?-NNB z{Z8S2qd`ut7o4lPM+^SKq=})UP|!oIE)q{IP9+^*%KO^LQs3Tf1bl1#s~a+eE%@U@ zp`K__;$6asut+x(^tHT6J~F>*qpiaXEsCRnunq>`ywtc4CTHEj0+3dG1HVlcMAf+Ij zC_)*x(@2|ocn>a;AXDp?vY@#w`6QD!WHgM6vW8d*gQ5+!2xKI&%>m@jpuxulyWngQ zAaJ9F?%q0zT(!m)5-!Sx%-HVoL#9v-6YrncpW!Gl2#MI^Z@tCNDk&P&UF_VsCoT3BnHiUVEXH?cmVI)EUEC;kqFIX+f3l zZ57SwjkUx@Xf#LYxUsvfFPI(+B#@*2+kj*NK=i~$9%rZeoioR*N?&g086n%943qnxe?i{sv6Xj6M2^VjXuNEA@xJEp%6;P zHX7$_&Ef3D@4?8W_n)&a6&y6au}x@5j|y}K3$N^FZ-ZL)vi92h@zk=5R}^NGgpjGU zpc-O9<@vI^kdDV#dG@AHekr+Pzll6X);sh^6ItvCQIhqkOm$1AGEmTAPAG)g7{6;W z0*d$Jvr#4y_)Pcv5#4QnEW2OKJJUf@2jd4mpZM!G3IE4yGmMu9rkV4deUn-M;S4X( zI~gr4;tMRuR&H@vRB41e0`SOpJv}|&K|)Fb$-6N&8Go>TqN=Eso%g6OUK|+tXE^?m-Y+>pT>}L@{{?dM$z0Jt zy6pS6ZwLU0%$-9eTvCaJkn&^d4!r<6!;^WxaCAClR4)PcsY1t&2{N8<$+wUw#~@5H zMpivc-{g`QR3wl8w$h)18kxi`)saE3q-1oq%JvMIWhn{aZVl6wqlxs3;Reex{>8Cq z0soVqdJkW&Y4&{nF-{g1mNFSA-5dsx34U#b(NxnVz|3-&Ot54UQKPz`gc;}MmG<1h z*g07V@_iOu_y5ooczFJC=X5>VcQg^>-v~8Jg0iNDbG0Wb3zu!!LE9-EC(0VruB_)V zBi_*h;Mi2NdGSdRmZV4>evk%oevYz#=l|+oe#cjskp@p8A$QyJQ%f4|axHB(1q>@P zG6bttxZDoAknD6>dZHFpke?I{I&$c$IXsGRaCMuNj=SQ;(^yS{kFYI?Ac>NZ1VQoC zN@XGC>ZShFS#F}0vt=6e5^1c`=&?Do{#W`A12`|Bd(X@i7&`MoLzzo6E2uJhX*{x; zptf}mqiaQ@E|7XRCeb*fW{Z&&f9bI0czvKUkM7svdAk{cTe_(oCkt)6*&AfUt;(hR z!$11wcv0z*TS$lmR5*5!|5}oX5q~SE`tSPCgzO59h_8V7c)5OGf{1&$B|#e<%UX!n zMc*|4RnjCH0f%&DpIyoLON*@5vqeBi&zsBzZn)zf`rnlUALd?)cr z@1gjEtnRc)wr)gdQ7IM^z%Fj6?tBurUb;=%MEJhWGyjEw+Uyx230fGZ6}? 
zXF8*yZv5=u|TLK`Uv2;;7B(mey+J~qmJbgN{QBj zk^2^4+H>1K*+<64$K@Z^pC3ueG%L&O+d(PBoYI~uhxD8Z?RQWF8N!Tz8gBZ@oh}`R zZDMnl86FARK+4xN?Gk&r9l$Rce6DC-25wN#5J-+=p=_DS{uhPxJj-9LFH`7S@8=HH z;tBUXeCM|NBeqJFpQL27dPv3cnj#p|glsS$0n62H`ypFQc#itL>nP+SiKuX*ui+OQ zV~{hSTQOCf=DqPs^(~XS;agb*=t*+r%@GdhVYaCa>uL$<=ntE?#%4=L46SHU#d)El zbeJ43<$DtDkV9RL z@ja>p8@@)Y0chG|2Af1~!bgq~i&T?oQjsBE8T9Gb5ms!NIuZe2jV(c3OlMidExvj- zDn1(wP@K0&Pe>X^73Q3n%M4`J{t#{kdm zoSYh^7cV=Etjh8R2!62>4uC^tuwwTZP{`!dhz)}bG`In}7+YlcF?Pe`2YjUWrxR^i z|L6jeP=zWrLbB0^WoM(C(_d%?ILiATpob*PreLd5ATY*qcas4$r9nfhH=L*Mue{@W z1yZDr7~`ee07m!lughJ#leg)-am6F=rt==D+Vh?^>-#=&1e51ynm?igC^3rORidhU zrNao2_YW-O{9pWQKZ<1s_?Z$I8>8T%xczq89yMKT9W2TC2PS0ZOIH(1$|0*xqia_aEI3Rp zbx+H}NcH3|hcgm|VrJ=k`bDLKCC>Q-6(x(ya?BkCm*ixUK3t3`{OOcH)-mWm4bbZ< zY)i;!L-4DURvAorwOdC0R&|A*$#TdK>DE}`pIts(mjx!v)U36;Auh8D+@*GvHa|8# zUH@LyicN>`$~+{7fLll)LtS2}lBjCl@MJp-;(% z$NsVH2FPFCg!rgq0|FqE&yd3c^>CVq%=OpdoIoLwJS?7u8VcP|i3=Tp`j-yS+Z*-@ zAojHqwFBE6lmwx;N!)b;F$I&~9k%B<7^K?wv%V(Kg+XSH=F6EcxF}MkdZWU)FvSL7 z5vk}Jw9{mTSoaj>Q#Ca$tuu;h^1^HYrHo6Y+U4!FJ0cm)O*g|~MleQ>&k%XC-qoc# zbKz=z#uldr!K~CUS%|kTV~Y!8h&59tlAJl-=n44#bbp+96p>rHf6xa+(kBHnyM#BA zdGV&CW6G)Qh;np;gs8^dSL0te>mpI{rL!<^;_O%3-Rm!t>EC<|m*;mlV&Jk_4@sEY z5=_+Va5b?Y9Dr%4TwH7Iwmm4D_*OYT*mGI) ziAkaQq{K7ElF5H^%1HP;GE;f?cPdlb^o6CiVkjWBLlTV$tB%bGf z60qi(F!MS#Q8{PMggwskPa&`SHijS3o14jt$W>MebYH!JQ>$`sJa`g&a1%;y{@V;7fV^7NBGfG0M? 
z#02Jph`@4Vq7~O~0p5m`s5frM;KfEBa$Ijp}=$T6*{bHrs?r zvGCg&Ug)EAizE^Gl$C91Rjfmr)_Y*BO-;EvHv zG^$&EB}1R|xsDbZk?SOCjeCVWQP_D5!$YAaCMWM!Y?lDe5~Es1i*|%-VT+!-j+y%Tq~%y&D{uIqsz-ssn_K6)umoAUmCCj3ZL#&RkREkNgJpFE67balLP-@fc+B5>T%)+R|4e#7R+cZYD7?6sh7fiuAmU9(Nr;a8b7WY9t>B7&ScpMT*@92ZkO~OX?Akr$}uUrLqxi zD!Cw%bSf5YN4fd*JrRozmrC76byv-auaQ3aXZTZCQ?w&|r$OIiQfAW_w5LRH$fw9c zm7W-w$+9eWcjZUlkMcwB`pJi<3K`nvjr|Lv`{%?DohGC@ULeD=v)ke&sKp?xkw(jj z10+cOpaY7&D1jHa3cBH-2}&gs(+&?*@Q&xj?E8XQ#HChk^+ab@HJ3axq8$_&!~Xs9 zc&{b`rCf+K<8vERMUuHAPLUV+L1|J z0xgzbFNq`S4%X2_b$};`xyB*+$m|xa$kZKc!kZjpsrk(#UfzTNk}Li9Tt7+!OVocsY0_KejA*OuO_~~zbNY)d-;7>OZE4Zh2X-?y**%o_2x^-{{7RmCIIy*t6 zPeCZWjw}c7-uY(VE$F6ZP=GDzHn~xL?`{h_MLi;!v|LwCD<^K7`1ulX{Mfg^!hzT$ zYNukzBv5{KTPHRH#!n7I*>pwh4O!>6sX}HW38zhPbtjNZBv^LZ4??6mD013olbEHn zSDU^at?4fwUfyM2{%-!}dBVzyzxXFLZfEozbeQ(P2d5pIW&abee?}#ms4VI8Ac&G( z%Q$C&9%c4$HfcT4vC2YeH+OVhkf=R7Ux}MBmd4G&LODUSN8TOFBq2~oQ8B7a-9&@% z?3k0p`GW}!TOK9OSs$%I9ZuSxc|aR#;AC~Yd<0~GE|w39;9ORRlfy5KF;?!8FvFE0 zvK~n5z^Vx8Qp=TKn`mt-;l3gdKAGXQnxoOsEicXyA{Z09>s_rikmf` zh87Xo_o>!E&N;(>)`8t`##tks2;*-Ch(0?U8ez$k=SDYF^L=Wq&YIX8!2-ILPrr5| z-2i8;QmRUc9I|;z_^DbP1Bo&BHa<2cCg&uF#*u79k~|*g6F43FN8+gy@j?R$-ci8e z7?|j031s=>>l*Q5iWceQ7QnMH($mYOUe{-^v$0`+H$w;)XKB>181($P7yQFzn8WVokMQ489x93+p-R0vk8ov$G?^byRV2JC%AKBlbIi zxVD_)HVQJ+G#ptE(%F_rIPcwK6`37qK0`7o-j52vTAAAT;6Qr68f)(WrK{%cgnS)qn>(B z1PqtF4Aja#J#%LYAYyx!ixeAivE%f+MS13Y8#r&0-i zN~9zXCFcpGu;=~lD6m6m(i{fKo5&_mt7Z;5io?CbJTHhB{5?|`PO-1aJ){gZXqUoq zSv)?onmeyrUzOHriJ2uP-0JRuX00RhVi(v==Q2e=iH-e+&pEn$D8QB_PJqa%Qs@q2 zrz4I|BM8mQ8x={bI=Q<<$}9VyZ_JBKST$VXPTN*oC4xV|V6DBYP`BJ-D(m}h(g)$Q zYLLNO^7SH*GV*jd$drH=#pNw3GP2VUChmmH%yHXUXCKhuF?&2D8+xl077b|R)y7hI z%^u_{W(D;~!rAC{Q^8@s{y*h< z`K~U+hQj_|7aW+UV4yUKrpu2H5@R?&t+hQ13m`*wzpu5rdgoKY>Ab( z{`tLDwS;wPBUZGi#cAI_olRFYOSI??A$5@Bt^joArg(@RiEQp3VEHQ1xUgEz6d-C= zXm1{{G!qT290E}=(MincAll1cZJ1-~BO^T`bd{(u?Oe$gBaLsZgNgW>%R7`rx)v*s z;=PY8c|)s>`{P1|4htTNnc%(wJ`hY+ zm+CqQI!_h+OkBr24^?XjUZLMm&yfg{FBhi%(4iwjZmqjdnX 
zaRIuu$)oi3Gb2D;_2$~wGYtrNr~p!INfyN$;XFoeZmn2P^(Nl@BrLXxkYNt$YFd1| zpjW9*7I3Y)D$YaG+|--k zrKPLqHwen62CsyC_x8xcDt|1mWI)o;JbCr&+*8H>$KUy4^Yny+(ll0qm58>7dmI70 zNiaa9MHBg+0H~Lv8vY_+lQ4+2scZT?Iy|hro1>-Y0ION4Yqre)I3!(Ht2e_(>~TOQ zQDZ6c%#!3*cTx**+yUbuyL51mWsi=46z5usk}v03sZKLH&j-`_WqR@R=KpE0XtuLt zwtjhDO@w8vG}*qBHXpFcUbQh5ZI^a&Py+aos#Hw?)xLSXutfkbV8_arEDwq$KPr6s zRH7z_m}(B2y_RUHm}!a6@M5cR@*uc${T*nzN|nMkqyE@#K*7R-PL@cHVehVwxAG;> z@DE-`5uH0CbgPta!3WD;X`zC6PTj4&2%{6fs>lN;?cGk^TYjw@6h!DF^SA8~OZO*ZsOz z&!&Ak<8uG=zW%?jAJ|r5pWQ+5@Y>ND=7Z0*(8%;VjPSniH1dfOOt00av)gGH?|!ce}lWmPq`M9MiDT<4hLDSgt;hj+?&FKGDA+Jz5~M!ce5M^wabT!lVS0G{cwkQvp(H?W zq51zljQ?wcPDfDwC<(g)ZY2$TNI)5V*|78yi*&;DNmh@fn|eX4y47M;rbon3Y=QBz z2|#0OL4n(!TaYTlNSO=yT@f&G4G^|-<7Jw8Gsy3Qp`u+8$A~8*-Q@$5Xu%>`-bU7e z%$XN93Ixb%^b^mUq}|{PZhmJToK+I?3T;l;w33A`cV|V-mFA24Ngg|lIrI4MW*aEp z)y%R1saGRck$VJelj{H2$Y(}83~-b_DNFu?lz&GX;fr$_fF0pdNG;;zGctkQ6Qv~dVLtjj*s`RMpf7RwRr1Pb z5Xdgd=FER$3635uR#&%gR$Mp^wG`e*xyQX~*iBR29}MtBgNlta>}*)D5&HO$_V5%2WG!#LNTD{Pcw%Yrn{ zVqsS5(*)K6HhnQRX>=M4sG(sFp`DPBK6S>o=gre;Ejh2CN=MG8CWC2P=^kchKk|GX ztg{pOL?b^vzmc5?`#g@4Ql=TQym0J`f3Bn?v2iGFmSCU?TvfPD|NCtLH0D}N{aG=? zmy8K*kjwU@*YtO8$>Zcj6x`ypC(eN3w94EL5BM{j!$7MiP*PHs&h&uuj(&cbI@>c3 zbAXnY_q;0u4QejvbS0f_&Yu!Ze0*$l<_@6x_UCb^B!) 
zfs|1Fd#`&V@S(W@b>+MXgO;gpDkW8h@eJ>mt#GADC7;)~USE(8&asw5aTE&1KN-N}xnS48Jk_jDLX3hWR0ae|k0tnP4ReU~!F|KK;_%ny=!$7rMj=St|GgSrp-~>& zvG?^>D@=H)`w5|AakRN4=N0V!B-!UP1JmboEJttsX#WKDoSav|aDOl^uHzBtg3TCt zzP?S(ekPgOpJ+x*im^0n4K|G-RVIgyim`+pL^tKKjf!Ymto^cHc$GQ<5|q%yX7Eip z?ZP(l;Z+WkNibq!;hdCN>Ik?+?i-1Wcrr`A)s54qmq+hWa7l1M!{RyiLjTnIQJ+za35iwMaHmrC;k&V;;fi8X#VOI=oF zdazEm5+xpnXsAX!A-WbNA&|&t#soYg%_KJYNSb=8kP_W||Hw#0AX5!sx0s|c2E4DrkGBOi*uar)Si)j=D5#u0Ecqzh~HbikVy1k6#Tp=#ky!y zazqYHXOzei3Y}q9P-gx|J4%K;W(PM^wCq3@%jU9M5*adca5Xryn#u24rl;dL`sX_d zW@3MwEHB-ow4K#^*Fv4>A&($=WSgG|7~9!W_UQ{@FzoxO2`dq%Z&BcOviX7{?nub2 z=Uov<9?*O5e*p)Qd0H|qc6T)Q2wZ?_%5H^z$Q=IXZs!9Q=y#mr{Y%f}w`t{5Q)Am# z$P6ji5K1zWX$xte=XNN=X}~r#Bu4`m!je=sE8E1k@jcs^l|%EC*U(02D5|4uU7lDE zbxs?}+0i-q@E9r11f@i+G^&iozX<|=FGc($;EZ*gwH>zmfj1}yZ}jk$WM)+_1iC4k z3#Kfa@GLn!KcRXsz=y)|4-%g0WMWEtjOE{{NvaIMVHPJe1|`lOaUUKOh2y% zCoI=v$UJ9{%j5O-8X=%IA_tQ3tvE*Px^`+^Wxo{>!S!zT7N8!w{C>F6bJexK*PD<~ z@MLDQ5(t11?3TAK$CclV?H0Cwn^VL8mtUgH1v3fc(Q_v1iM6YgI4Xc4i%L_EMa5$g zPSgNcoieM0BO@!T3Wn-S6U`;%ECF(iFa=p5h7!c*AYkx<@I$^N>Q7L-mjkr?c)mM? 
zB>AIO0E4y4M19A#0h-;r2P5BMU}5DGn{+~!r1ij+X-bSpv$qD2ka>AOC&v>&d8kvX z(k)Ae{Q)dE(U@Q@1?}U~-c%QzY)M?tfIQR;ve$asz4L>TVEX{KCgS{d86D< zFN5=L&fN&Fhi1tG#bgv*s3P6ocLSZiV_?mg(b;)u5-FWP`EyM6-S(YEQEHnlzp7zo zR7{p2cB*DfT3CB#SD*711o~JHO>A2dxXHB4vX-sAg)eV$jY{x8gIaabvE>@XJ55YS zCrK*D<}=~R;7G0uOt#93*}!Q^O~6J%+^yHT9?$5+FrL?r?mH zxaRt8#N&nZag~ewd`_m&99dofN{BxbtR3;?AY=p|s!ZHXjc5(10pP8UFrFrycMv|P z=saa94anwNapagQjBPKJVJA7U78J-;35y5|ArRWlBLRRUL#s%mJQ&aALp}NT>lj*@ z#*@c?v315lveZC9{L!d1KE}#}M-YUtkRtGWdo@C@z*_XBv!FFnmKDZgT;g|w8UazZkab<@HCi*$JWvzQMy@xtC|O;rQx*~`DDVx!3Mv27;Q(CJ(vbKzL% zK%;hkan0+bF+w%MF&mpN{qiyv86QvioFa;o|L zzgMcooxvHlt5av`Ptxx0Ec?4SUmZcH(d-xc02n_^4EsR&`mAnj4SdVR^^#Gp$bUC^ zqsiguF-c`;WT{EhM+5~fr*K^vHZW9#5+dS|OXF?y%SB@GDqZsnRd#rIijF^x(w%=4 zVTc3+OEkvNJ#P&|robhDCPoRb$@1SqKiYOycB8STZgIATDrUog!paJk`5WWu9LxdT z)ZX;9nSP1D4>l(fV)f0RwRS#ePi5`n&;kV-=>{9jZN}7cD^5`$49sGQUkS0G9#=Cz z6?th==f~G^olf}BWDAOhtkS|60X3c|Ra*dwM6GQuejKK^*60Vj*2JJ>rF)8R1zYqrv2 zx?{H-_MlcgwH{bpLsuOB9(IET6Quj<1sh z7tDUUwd53aeYUe?0Fg`-7E)0dFtQ=S_saf(yOF!97ZxHXA$0)fHDRIh(bz^_taVg z;stZ(0oL&OA8N}U=)%BA+UvuM(0)pp{~ksKx{(+Z*IVGg%T5kj9O3v-lfqsf^XC^q zLDv;=FC1EpOwnCX5ZU!#;%}ZG1;!5#haWt6*@QlL_OMqD_cJicWb>2YlX#JvroFG0 z7%9sj-%w?XBv%t7WvjNz#kPPY%RL_(6FbP}NB{F%>wIdoA&16Pm$xIw^n-yI>Az&d z7!FI!JfQO4iopMNGA+i9g!6k)hvEhS^GY3q+No1^JRL!en2M%S1jVs~MP z(A-`2SjRPdgPG++%<8nqGv}aw_AmAU4E#A;WHs(r=DU89(X465atFvzXYrOc*fHtl z1aKu(ua|j;f7QOW{M;Vo-{2hh2Hz%L{wlg8Fs99e?6T!Khm;dF)zCYrf;q{6bXJ}U zyP){PC?B$7rsh%O`11$w8#}8+Mv5{hHmukp3$qSwSIHXHxaSLA!QPb}3hyQw_Sx?N zwrTglBks35q91VH0eAmKFsjBOb|@X)%=KG_-+2TZLnHGPaYb`Rn#LSSf~iSwVKaJYx~{(RVK;m2swf+5>sC3*2dcw}|_;)MzJv+76b%xg?XS(;;pu;c{yq)nta zGJjtp7(JPy)W_E`YSm<_t$)M~oHgcCd^3d}$6*mx+leuby_ID-gK1`WX0T!OLWKX8 zX-KEN*N;_KFF1^qo^`QSC>c;nb$t4WX=%pM@P#OhEuM5E9ic=qYb=SZv*u@gMVp)T z2xj}=Wecv5A*j?>(W|JKL+jiY>pj@ELEXIL( znAbMtY(6rfZ=gf5NQVI3X59g?eF2oQibBtCK`>TQY&COA+l$Zl(>N0&=hrzdcyu zMEr9BmaWy~2-9jYm^jG@W&KO(Je^VMFDb=In4fbF58qzRsqKdfC3#$Czc}O)2%zFD zdAlBi@o-DpFefv5S{gWha+#ytyZjFOR>MLl-~i1Jwnplytf1`dZ1OoORN@HIvZ+jz 
zPCH7kd)h6nDrZM3e5U^<8dr3q*U((#w;KU(eiX{#oO-bP-5^Nb(ZMh7IVzgDFY0b4LDfz4*Y#5(d}_kHH}b2%&!)t4`=j$v)HUsi9g&^ zI5L$7?AQs$lHdo$2)3cO?7hQ_n+ju7uvqjf*zQBv*&zi+pLyl(R5oU%WH&$`^2RH_ z09(~UG97aWHHQnOLnK-g&0BAogR|oc)L$eY?vG#kBM5bl^lVUXZ*Co&iFq+U3q0|h zo-C;l8FQuAI!qC_UtbK}ZfE=aC^eJwJkNe6?OtCm<-8+^x!p?_E#eKzLwO5c?a$=) zK>a81cs-i}(kiU(V?GyhvtG@ufJk5hFnc;Zizpu_u{<`C(8e1|og{=tHqvs1@4=q|w^4sl|QD zBYytAeFzC@@*s|{;h3MS;}1OeIps38dXzb2KDLk8GS)OPof4Tfw_nG7rpBs%Cg&+{VE&JJD9$C%%eO3xD*7`%W+}RS;YnB2F zrDDfL19&Syn#>8XY5yNwBpcy;EagRi35;WS+;$mK@z+SHZ_y3K5-kai7GTG?*}Jrn zW_#p5J3i7&^(^g5!WgbULDRs((n!mQtYvi!zYKOmQ94g=l_?ZCDG@3RN$#klD6B`c zhWXE<`+Sj7_gK^&b6x_?3Mpp~{F2nlxg3w?q>Q!@15f^8dE3w!bCB;}-Ve95SCLPP zx;#UHJz(Q3`ezD{DW_$W?=?r@$*&;a1k;qq$5>po*UORVq@GK0tV7Yk=q9C_RGJ3`|R#D}~FG zCk*%JQ}w4Xd0a`v z7b^u1nxYDwfLk2;j5MkL1=V-ua3IUK2czU^}hOwC+dq-c<#kl*K(evw;;24`ZE+tt;k6IY|ZU* z?JILC6+8cjSj$M}Hk9@Ddn=t}mF*c|zZxA){C}UIh;SDI;6{Hfs0;kB7GMgG$392) z97s43oM%>YIQW715{dD}A#^xg>+frVkiu0h^>@5fn0AC=$lP^GHU8EjqO7)~*<}oN8qaLP~v|J@EJ|$7DtlWcx&Atosa=#!{%*-a7}noStAa5v0Bq za-u@%=~0{*=s9-%-TRuk^BaciS}5xq0sCps(EZ8HQ0_GlFcZDz&7XA)PLV%xixnau zb?*QF8(z{2)sWhnK_90O@ozV}mjujvFLP=0U1+GQ+P-1^d;CkT${R${NbxkoU-t;D zj|O;E9i4g^ua^T!H&CF#OTz+nfknl`9%)uVE!W=9JVIG4PjvwW6KCLwacJljl%i8_ z1fHU)?UJt=g=Egr7&Ar12kv5*oJ5bI2#_s=JYLQHdpY1MwS0G@mO*Iw(Y%=WCz?7( zr`nqzWEIa>N;`YIxjG#l+91BDciDbClr%NT=>Xk$&1XGS$F$@>RH6xkB~?CbI5af0 z=SQv>EEO{;3!uq8fv*Vx0ihcpkjK*~f%4#Z&d1GQ(1gMm9+ME+LQGMDkg{{)SWHnJ zz733tQC2V2MD>nM#858hLVsJesz=Ex(yfMY>lg~O4Oy+SrbM5~V;b!;Rda*Z9pLkj zWzJ3*Kg8S~f@ak4Z{Nf{Jr$ZKm_Z^#=ePf8?0*aqUygV>V3m^G z%+A)Mfp&^C^C9ECEt{Vr21~j;&ssU|rBbHocm^*5jSXg8y*2J+1W|a&5Q$X3*G&&I zN5eS3J!fju{tq%|pS@J$t5Pi75Q~3*C2p*%Xq_fkgvn+zf|aR2$gdwJ`ta~@w=dM| zm%8^hQ$+)VEUCwKy3x=!phm87+E>(b-%ZqRaV(T-W_)?6h&wh+Yr`Uib#>FoXUF%T ze?e`YomIDXgWQK>CTt=-YoR*BSz&jkgsdEH=3RV4Bqh23T^ zTd|2ds@oAQ2nb}j5?JXrDOSpVJl`zQsU%41cwoz;pomS3f3cGrPb4vFU-!M?@!mtp z^4fqx>AF%ZUXWXFM8L2Y-`>PnMo0Pbvd{KlUpu96kp7rz ziIE?r2Fx)p4{$%u%&2~mD2bIu2FWna&Kl}vkk59zS4Z*si8xTdCHXuKlmnYTsVrDB 
zYzydYQJSXB7Ryon;D$Uw{Tq>Mmt1&fiF;v_oDtc5H{rZ=a|1S z{v{o7P{LGK(uJ$)WSyi|qs(abWI1_fplhAoj_@yuV3^^W_qz*?@+97ylYa36h7Z7Y zl8TW4E_71mw=lpr9Rwn-wq6{SpWFcbM7{u$Q;4#ftQV^eezvf`5T=8d3k!g=YS!`C?Ld(M zg#T~y0I!zxF&vk95NKg9fGSc?oWTnI$81F4B6yPa%Vx=?A&2}ofHvI*o)c9kL*R(O z<9A^I?3pz?k;_TJX!LP#NC>?R5A)&)p!a9LGETrSQO`PZ+ltK83>hgZF)I+a@+1vz z_i(#NCN>)^O1zH?BvoER-{9tGX=ypi&YH9I_V#AMgT}zdF6h*VdwGSC5k8sf?hv_> zmVI?C&b|2W%~Q-~O6``y_%T{v9)TYq>vhl5fee;Z`65uQ1)PEQM$_cyg5N)RwtH|6 zPG*KD6T8EBwYe}lJps!YJ|8{}m`N2$e>8tKEY!@0wgrOO0C?5hz5)Dv>%H(0?%6R_XnC{~ef_gnKP}{9$u=pMj;o(kT?2S|2dJ-mHDC#>6QA zF0gI>FGqo(O-Og}WUb>Jg=9oG;9=%X^pfW#4-XGXd3oF~O$(7sP=L&R-Rp+vCGiK5 zi$DvNx+Q}XePn)*>F68v|6c?$Qp8)t9I zdiVt|V9>I7{3_47qODX^k_$Kw61C3znrz){Z|$#7f3{RuQv6$Dg?ODqZE01|ToJM< zzf5j^ap*bZ5mxkcgB&<`Bb4_g^QCI=^%hf-APj31g&gM-Bow42CFVt1Pm?gPuqbzA z#hH@pNMS}maGXd1gM$Flmz2ZU;6lrREyf5C_McW7O7Alva{xue&Xs@$d|VJLWEtd{ zMLk=b33TU|%?Ld}`N1Ga1O(?9i@;#uNWgv_NG$sPw~>J^$2gakl#a$spVnAJ0v-8@ zmORYg_c)J+j%F(?jGRkdH|7V*4Cd|GS@#4Q86taX96$TnjexuA-kMYARTi?-zH3(< zi1&1@u<8ThTi#s8{Rp@i#h>bc+Gm9#T3{=XIkNQb#S+p6^xnUwtcKA)@hMu?5V-z5 zSx)-9PQ?zQTk1j3&N7hERLaCt*}V^qelgQu?fwvZ^F@q?tqOqeOwbDQ@3`$wJnw-! 
z)%Id*ASpQ+_ewGAl77p0pXJdn&kpY76 zD`<3I-!p`R0O!Vq%1V#pg^&N<9%ENmbb9@QKTKF@?E9MuMT*W0Bfwg@63MOmXZ7=& zRoh1rWs41Ax#eFQq;pU{y0yK!Hz$YhZyT-Fe!hLaFZa55yaByN@F(9r{xDRE!mh@| zn;1Uc1k8l9TdcHY``B5Dpa1=lo21+6Su(pz5}l%H?3{rBg#aTXoHqj=)u~m!%uX!r zgj%d|SYF&gp|doRT1QL>{J%8G!!&|o~Y#ye0-LtD9Zq+Y&SM zYJ;PTLNjH5JBL)2etqfri0=CehV&H_LW+c{Jl0f7pwehBJ(_Q0Hav zyNdhDX5IVu1lTaIsr6WYKQedL9K)lbnA5q+ckq2V3Ho~081P1b<-iGvpT+f9^8>h4 z#EXL^V&lm`s3t-?ICK;l)F4UZbpmSF3t%$*3E&eIZ;TA`+F5`_wB z93xpCN3=tSfI9vXp&nkTRc)Y{a|Pr^F5rD0QZH2T;7eT^zR}k0AK(}p ztFO$SZ2od{-9)5QZqkxevqz`KIzeaP=BM$sBs=l$^Dr44)c?HE!0H=|J}XN(a2zef zaD3W^G5bA+44n!Jytp^17P11eb_Q9f={(b&x0~=|Yc)MwPj|(S7yH?N_mdcz3QJ@Q zWvixz9|PVr&A#Jq;wsKGr^P7wn3kStJNEB-^Vt{kjF4LN74$#y8v9LAT+2cabIdrT zG;vOxJ+0P4@y6F^JlzsHrS{$|MGaE(uxi)~Iu@CYL$oAb*RlFe6PzxMQk3J9xg#j3{+eR1%w#*m5qw3Z@t2sAS_2A}(<(P5h!tmvP*In{RbS@pqYkRAG81XD`R_TUMG_l|b z2;Xs5F$ey0S7yu1vLULT)90tNXFc5_s~nt&pK4dHLQR0`tmDtwde;)NqC2l_Gp0OuSwh1yAF}ay zYk#IOqCf4`O|@}Z_MY8yZB3|fUAHJr61`iImCDKv8#g0EN8x~pt6m*H_h$^1R7z{W zuiYD9#z0aKtEIWM!+oGGl+^y>)0|k%I782Rv6{9+8u^G z+_{=Z^ExoNr8tC&WbsjSY(L};r7#3is#Z{fycmR6s-hI!*0Cw1>rXU8QQbU^JJaX2 z6O?YmiUcZ9^mhok+F~CBInQ`bl-Lq`B0G1Az&Ki%kH9_mfTbxbVlUk z4o3u1pns#bTe5z(?#(*ow@cCUcjkZeon4dAq7vXm5)Bkz8a_dYZ!VETTDGoE7FTmh?Ku@9*QhH0Lh|) zNJ}^a4eCpy;k4i z*Jkm%ZAV?9U{fc@k&S>nyIy7)>ob%GkYs$oBp|EtVGwje6~{G}_z&_fi|>p=hTgtI zvTB?VZqQ9br7-{Jr*M8Bw~}n6DjEt}E#W4V7kOCvI~0H9=*!^ebgzYZF_0)Dn9=6j zOSW-hK6*nYPnshaqOW=s*c+BK@jX6$RT24&W89evtdL`fedICTj`6)9kKg-BjfJF( z%_tbo*SRZRfh0N-E*bV`g}eFiPnYng+!CUYNj{hei`HvwK;>R>M=(i}nc3=|_~SZ# zD0PgN>y&5W$zKhWyNAo&yb!`VQS!4YO>M&@{$rU;qk^t6KcVNBY-`rUJ8`t4H(hgr zpV!wZUu;|K>rlHbdYYGO;@&wDXqco6xulZf@`2^%?{=9Ida@b0rFdOhmYZr%an#o? 
ztayrl@u}#z^I2S9hOZc;Qnul6Z*WtE!)BOm^KP;45P!@B8*c{8 zXrk}dNo9s#M=USv3uB&a{cDQ3#uC%(Pp+Eu1(Qa5&u44Zt7ker{$@bHEQW(o>N� z6f_Lwq_{<|c-iTeG?hF)UHbIAL(xWQmmo*R@#GcifS^B``PCLT*t%hiE-~nrg%QP# zqpTG9Vw&;##K+!dp7lXXu6*krp*0$9hOZL1p|{nU$aF3G3!>Ffp+|}QuE<+aWA!FO z{H%wyN2T+;W_f}RG4a?RAV_82T4T=J8l`;ZoE{zV`zH4q$PbjoFnKpm&u27u zyn2+5{q)51W2SohOXujMfA!H+;m*Owi0DeIh*!Brqahl;&s2JzEH4({wQrkf&hYrK zanhv3jEP>=zU|3Hy=9)pSQL7htnm8Yx^n5#Fqg-UODUKBNLO^if)Ur{Xs<@Faj9VJQ+wkt=Toku<+-FP*AmO(&Zup(#^ObJ+JY$TEZs?I7A;t({k27u)EAh5#o z{f-WQR3EE*EJ;W>8|1omoVaSXtd}YLqcSRIq^?>qlYObT>y3!~u1da$_40cGFo1qO z0Zv8j{lNHCzTkS;!I+d_ApLUHK6ciT=4Uy|079uBcMT+thpSq{S<4i+uIp-zb=frctVi z@pyV(T?^e_z$07Hn^$d{JZh}paM0Jm^@Foq_o|4;haI=ad%va^_98tU0X@3;J@+>z z1pQYc+~in6(0vjGjnD_*$Zt#1_aW-y<@IN}Dtqrf7vRwDR68SV7IR)swI4LP_URzHXU?$%ek5wrB}X|-rkBQli(F$IcGoAM1!;bd&Sa| zAaF*nR+X*d5pg;W@^NJ1o3gPzy7_$JK%AltYRMPA7toe!8(sXbRH);X&%lM ze$14nA0InP^n{+KLjY_n)~$CEt#O|Dy*U5yoFrXHK6*`2wjIo|4zIRyIJ^xt($*!= z9b4wEH=-W9lZFX;9qsZpz(Uj`zM+=psi=FENa0vw^+JCF<6x3RR1m=lwiX2TtRj%R zfQv)Gd@W#P#GtLU)up`KqCM6GqJHJTo*Hy)Jg@+Kro?5cbr6gmyAF@>%goofw1|J7 zzp%q!*dZ$JZ^fCrEygx3+T`ntns7Y0xZRSRN|jSop?&MygJa|A4K@JW2-{F>YDO+G zK3nxB?O{9>^$oh%#JCFHqdri`7l8mdaMuL%jmvj87lz>)TWK%`>@PYwIaw#nQs6%` z2}^bWKR}WJb}^#tPVL4AngVi+_(8AWgWYV5*FwIIa@T!A8R%JnU?fVCo(jV4Lc4)PT3MH)gn;Y7|gF6v%%6!L>G5vQalz;^Y8TL zYV+h8RJ+JEIA8hWT9Bk4BlHw_q2YJA2>n`{T7F@8chJ&@t0&iyLWes0PQPIxx9Qh$ zqCVB3T=kV2H*OSXzG}MRs1)W$>CMwMz904;Jw9@lz2lXuwLEV8;qK>vQt80&g_g;s}3dMKBg+BnOH&x>`pQy-P&1~gx0o-xy3J>EJWW;*2wm$sKI(E@g{=Hh#KC;@w#YkAJS5(tu>{ZeXA zT%2+87gk%#wJY&5+>~FnJBk3%tfBabbNs{rNHfPYJ_h!6)yERZsTy)_p|u2?w+t#; z48RhYO?!VEOe~K)1Yd|rW}-_d`Dmxa(S6o-c~3=5M^UZE*qlsx3+Pu@IF2_^vliA` zxVu**IlUv~uCW6J^_>=Z19gg|hsB`&+B$cprd&wUEm-I?kN7xj5KE z@yQ*dslCM8f8Mdq#AD1W6KgPrc%DVjlZXVo6S1efU|#q!@^j|5Q*hV?A~sgsy%zd* zn6v-}vWmdZA>*tZC?F{7k0`2-0*$W)J$cs;}DJ4vwOwx`ibemt?eFvK| zi|oalhRq;i&dIg5bP;9#6EU$Cb92$c-uL()Tec%u;q3l;zI~0Jgbmlr%d4-*M4DCR z0ho0Kvki+9M?25SquDjGzCHvFJUd41izQhX0KnDz@AehOu@WPTX<^(YkjMTZsnLE| 
zP;~7h(`Xm4=ZgTGswu`rPQ-U}LE6aj2Ti?M)myYvO#-r^Vtb3F+$Bm4mra$};(^ia zbVxE*w;N=WYJH#3&N9gj)^J5TAD9B)2f+w7_#W@;H?yHC%`K$uDhPXeByxasfyOB4o|4;4EyQ!^Y0aTi7eWy z9zN|MStSg}J3~9tD2kEIxq2^a@WV6lA9l*um~*O$TP&gn^Ak{sKHOid8%Ed?6_{~r zE`HqG(r>Gpj<3@1+qLdVBrx7y@0#rNxWN4Y>>^YnzUQ%kD*9D`Lu!Yt6h7UD4Y|F# z13F11AU-t^22`T(xsmEpM4)O!0~t%wP`W>47l$QP(tL6j#uhVQeBi_0P; zn3vNC*bL<9mga@pyk4vrhUn3l_W~Eda*0XqL)L=8O{PZn0iR)EsC=P-c-PD8_WeC~ z57EVoe*0h4)YR6}yEQnFs&-^u%tdTB3II592PbZ&oF`tpnkGHM>9I%6=knl z)cmWyqSu-YKTQ=y5AKP};mbaQQX2d?E8}(HS)vY!j*s{q@yL3kx929iJLaEWY|Qng z0YgIOndJ2_Yoi=iQ@vj9=MU-5@^ZbdjA@plcR^4*L-XIpoL7GsJ^#w1%BImS%ROT{)+1wH(Sd4cUueZBszb|doJPNX34l<dIo~=B-iA6NBKfF8gYU37{Ucdn2vm zbK=mucA-xrB|VnP`-fgXp_>P2NNAmg$dgfj^24)HPopRU=PQuN_`O?+<>c1N_&6=d zcq&n&(A*g|F8)&Vs)U#8Ickk1wqMtDmRZ-TaZ&SZ8AgT1Ouy1k@jO0}=;W z!|!V7N!)PRIk|K9#+Uo{atx{5ECUnYgbAwNeJJACNc&y)0xX%RhmxJPg@b*%r3S2B zRzslzww#vh7P)?_Z%Gqw9%-9Br)@hTCuNE^IL?4(>Gv(wBZ!K9g9UA|0HBbXNT!H) zj=X{#U??FC{`h*es@0i_6EDv zXsw4wA>I%SorATtwL?4TGtb;=9rfK?x^1xjQ1BVgHzZps%}lhugR^zwntsQbUKy#0 z6qyD{0~>_UZIUE9svD#>YBrQ*dp^}!5Uw=Iy!n$05Ly>0nbof;iX4~Non=x~R_5U~ zzWvaDobYnF`i*!Qe(G1#(hMDUFO!0tSg1y6C}sW{DkeT{?h4!i{J3OaN=4kuHr6}C zawjH?ax%3PGaEi{C8jfwum@}g;p>leqe!WH`W!~Af?A$eUK8K{ENDMy`l_dU@go3` z*G`;1iD_nh-e3Rj@mvpBY&ZFTv^L!CMeK7zA3wsBKQApxwX~R@3ZcJt20v1X08zpfR>BmTdel=5|7_b82P8DzU(2}dC8nzJDX5? 
z{hE8(rO6Ad6U|_>JlFo>_}SpbFhqTJS@7Bqp4KJ7|K)?W~_KHS0!Fn+#9*0MZkO= zC&XHxn$rm>!D!x?Z)C5%wPf4YpI02^GL?5@XAO(pE}QJMGf3yXWq2ZE#x6kbv`w&5 zM;`IxEORyZ9e*x3|86Fqtwn2|#{H5c=MU$&>j7=14AIh?cH7Z#4R|M073j-$T0@{S zTxcSAKZd!NjYT8EDpxDNMEZunQvccwDo>% z%Dq{5MrV9$DJj@})E}qfYwgM+PiCSilg|dhP~*gzUa`Q#Lhz;uIy25Ac)8{(67i%{ z%@7(#i(p-Kt|aP~s51KW(Rz%SI5zB{!>;rsaAJ@A=aSd^kYuN2Gru4HcoLSCdz*S} z){y?G!91b_ZncW`1nrEAHHW#6emc@?J!(!@B?@^rf%27~@FynR-&wBR@_9}dt(F;% z0#`8RiMg73{)hL(`hz42F=iff6EzOE*!hszNy-gd@#UiHO6Nh_dPf|j3O^Ri_jl!J z7T6g#tlp5BWkky&?slIEjEyeS?|R2TLK9Fq=G`W1mYt3kvE6;EaSe$iiq7ID!#4Pp zfA{qs8o{TzHS)L&xM``nZu=u3Nt` zBZBxhnJKqx+Nt4lf4>b=OT zc(6k5(8MpXbh1Gv-k%1~?qK@7==b-;OoC-V{5&hjnP#e&XT#uSN+Zz2=q`Lk1(#%% zv1hAshA!6yqe-fZP1DdG^XXdloo8FvtZW?2k1;5T@hj|1wpY~7FQJQAGqg?~_j7bR z+$;<(y7S^YI4#izdmT0mDyF!)y1MRQ4e3L|m4T|CM669pHjcjy8bVc)_Xxp(?&d3~ z{?I+&|9oENBBy~><~0X9a2KBqTwkzkzmls<&dX&gbME7F1Vvo*_TGy7s?UmSd%awk zb#dOK4DJV^BH8Ohw$U!LuSxH|lOzeY#7%%QRs!3jmSjEd4x={0j`il%jfXbo>{?|C zGbxUH+p{JLo2Qyfd^Sr)P>M`saeY4D<<4)f_zgdO8?-cgcE8pw!aCUcwcWLr6M&#h zx&WRNc*$uZgjM8+O=qGoYRFUaOs=gZWa=7B;xv_|R=zln?WV$dd)0DHu*ZhaUXQZ< zKvPJC*&tu8&g;N~YPa0_&$X3FhB`TVg!RTCOBWpzdG{Q#xXwFLq0_jBtVJN1Hw=fP z``4>o1CJw^kaGs2*X&Eyo?LX;bDJ}@&P*g*V`St)H|KC60u&B7TM z5lCNT{o#~;Pv-Mi`C1+Bb^~{&7BHnQnB3tC$HsEn1VLTzTK8z@qXVg@8;#N>P+Fh@ zAV#+nXHL!4y*tye@T0}PHBoqL?C$fyZ{@a6#KsyOyJksZM}lGI*XWM;!Jg6DTXxBd@{yb#2i|QL z1lUdQ@{u6+Z{FEqiP0I$(X=pc(~H9%$nPIVS2`rwAN{z+q25r3foP)IjYV-TaCM>< zPMsy$a=B|rWPMO(T>&6t+XeBF?#}C1?T*eJ!wr$VGd$5;u@-M@P}zC>1aU=~MaM{q zOKFMS6s5M1)KY5P)qI0zG*Ud*Y8{P-`8*tC4pTf$*UZlInFP8ut|`>6m$U_LfwqYE zk#g_B{`$)^Z@uJ|0Eq~>mV%O|*T97&_=L^bFYh0pp|!z(;w_#b>mU%JZpH6uY^+Q z{Nnap+~#~scsaYafyv(XJmt&v2#Ow4L!wW6FA~3`KS3G0BIOpQ?+`Ha;T+Vuk6yRM z4Bd?zJlNIkOmNhjzS^ZT^obS!8U0IBy}lYZ_s%4b`1sS;zo5c=s;H3bj5?$}Y4XC( zEjN`EN9xVZNrK@Z9rqoEf>O&hR^FthNBc7p`SPUJx+(Y9La@4}eWjMl{iLA#iI@oU z7ulV&5I*4XD}H+8B<_=%csIX&lT43gJ5l%LvQGzlE2`8}p$!j~A{`rweV2yy_oBI^ zX}wn_)z!K7%KHsk6HZ3lC6Uv>^|I|Bs#s|#4ON&ql_eZoYx635+GPGmi+%_1-5Yuh 
zbK4Ym0buQCfOO%XqP2O|Da>j`c3UaoTG#x>%Eb-g zW?G^3Naeis^yhjI$HlS^4t>AK_HAUP8*pJX;d?6!{lg4#aa#Ek>1Ce z$}+}$sfYtU69T8HN5pOZ`)wt8`C`ZkAA)Svd>~o~<_8n!fP!Q>s8_W*G#K5stqe47=9-EQX9yVn}w4NJqr&+rged739H+?Vn!nbYKRw94~dOLKWlDKwQ8 zd4%IkMEG}C_=--R%2jD>z{Fo+$YAkQ&BjIzGS*Lu3iXx0s3ZOqY;EF5dz_gtVBBY` zOk&&%q!U*IQFK0bpL5+!hcZ+2k_h@(4(@*&g)q#n+SA zAbuZ^zZSr~9d+#zW9azw7S8rL^5C*q_xb$F_?q1Zd@dO%&8(sp+*cC}QcLseouxOQ z&{7>vEc6!V`5qXU(O6&jHfi?oCsWZuW{O?2c}z9qKKqA@q1n>oYl=MOa>3Tc1&=^D zz4f-MT?vVbeObpvUn@=S`rbUHxP9|nuTLdf`F1nY9qy}(gIbrkxU85LZiSa20aJ0? z-6YygymMwGIboqyqE5i0fNTF7zW!dzMfZ;Z_(SIa_32OaYrshS{Cejx+g9v|#9`5t z$M_Om1*$3ao7BLWtNw*x>+RTxI-S5hymZB9%dQrbJ{KXizmh@vF=7cw59)>*G}g&d zOp}t}&%L2Djdyrc>A+R&m)oCT8VR;K6#$g!rVIu6!PX<3F`mOhyk`jPy!Y(;t4rSj zyW`u^9pX-lW9!p?XGe-t@=B|5hjhTQre0$w!)iMkR-Ud`P+)lGbKp7hMO5@WZf6SOiN$6s`=zkiz?`{BKy z#Ny5U9XQYZd>*%86KHb6wtCf-DTh1OQ87k*0v9}IUq3PEjaQa{ zeQmK)UAeqOGqW^4~mqBplld`mu;u?|vlr%jBE?9TD$Q1EWx2`;+kLQ;_++%TIRDj9>B*PbK_Xsgq z$Quh&0yU)@vWv-Eqb?9ZoWoy|x#xPN{V($ig+dS2zJH>>xv?=@K*GyabN}4vHH6?( z8uNt|-yxKxrd?kh$r8bwA%ARm z|I4ikC@{#OpC4~4q;k-wFHO~5#y`qrckuOiTh+cxO8&j^WqR~d0_iOZh*>_%J&sEP zW**!lcJu8!7;R%puW&>ewUOuBdU4x6P5{!9+4Pox5LpRc><~yxoZGlBOPe)hKR=t! zgS}wH1LQ)j*WBMg^d&1>J>M(+CO4)OSSVlgo19y8`<5!n zTBgb>XgqJxStfGTqA6)b$Y2)$3k5&o)nmBM(vlI@WIf_gM8foeR{RWjN2b&UP5}!) 
zg@L!`1FR;4^gxhDXpmVr0OZ{s?#eCb&@kYDcc$86JQzm-*4Ie;Aw&p6k^CmQOT07> zcE=F%n(ijSe!_5xeS+a|V`GPL-GI$M)#0jHw)avvH(1h)tyY># zd!yxiVPAZWH=T16sn5^7-w}l*Nv&KhuE;wc5dNRJn0}cbV;t#J0r!Gfr{?bLIm<-S z&CRPoiBFQuXy*T+Q8U9d*)kkkIaX3fE1GDfCC<`(FnU!sKU-Bw$9=>(TFmN&qV#JE zJsoCYVM0RBqb;T{9i{xMT#ZM*ZTAnoAfE5vl_)QL2Y}+}TGK}uuVrCHLQmr7_{VZ` zvkK7Xf=;Na3(yI+TZ%KHckHB{@Ns>auQ-edv7)`u#Pv^T-D7E8+j-rA+cpd;MfQ+a z&Dp&PXM7jR6H>Q5h)I`=Tj)wn-aj6EL~Ud>ljs`W5zG5tHW?Xv?}1Ko(@5nY}*&%xnQ#?Cx`~&(+oj{6g#tjd zsRv=6rl~d3o(Y#>Cy`Eqthl#*u(n5DapZ#Pb3@iKxBWo2PuA);yrW#ZhiehM#E_SHexh+0K} zJ4>%Ths|w0ue1{i0z?ZtBS{?qQ7wh-*nM!=DZ27K%t2%e8!-Zj`kG?BGAq!>-5laD zRvTQfo9G5!~2d$RL#O90ZeAxOHCG4!Am|J*Yv^3%7ct6P}Gv}Xh&WfH9I=oZd?M;vVvVR zCzkpZTCNi4nZiGm(gxWw6DPXm>w_!a5Z-&*eS_sATl0FM-$5>DMYCiK*~{0tanjvv9k-=am2XJNcz1oAN#ibodO)Gx;UA8xvD{a z+dnFtf1Zy)M9s&k)mHd8X;vc>3P&wM=zSz;y^?o)gkeZunurLG29%j%xGG-UeyAhOeIp=Qj3n(!nkRa0~UvN+dtI|^%N&X zg4jd31OzaIHVPcgfc(yqeXQBB$aUWu{K1BF!=I6NX;OHJfSsh4s`(jKAAEwF4Ey!wif#l`)*^(mWT zt1BX@Uyp8$tvnK2?WyKg)0~Lg2iq5;;7Z#n>Tzc1+L?!vQV`rQAO~~qGlGg!=T3}~ zcQ!D1Xe?FTp<2vv>nn~WpqII-nvvCFBNY92Bu^975mbV)SgU74)*Q-*u~iSX1inAW zo7(I=vCf0Y+CVh7;k-@BC2!h56I#*swY3o% zqEHLG<`PK~k;*H)62^A=wS6v>c$LFyHoZ;~bF~dF;`>hhD{*mH4|A%*KHnR{A{{g{ z2M|8*xVYy}O%hCmtMD!A82Wa?e5Y=dypw*F(`zhycbSBT3^@)v$fZ9+(mD<6{t4I* z^%A-Qv8x1=t33>9Q-%}?O+qMlkh(5(2LlCK&z87TEd=_v0!*lMND;wEo{K?Zyx8i~6=LYh5 zvdT9S^i(e^tY-u{T8#+N;6~8X;~>9S+bF0MxG(K9ygHH7f4!}zi);W_3D-bsc+%0l zb#b%ip=jKP7`)ExaZ0Y3zA|g(=nrQ}27-W>Kn*#7BxC`Q-dHPFBGpVWe;-S12J52? 
z@ZGLwTvAE`oy-|n+kV+G^zs6*e7TJ3JPUUO4rQqg%^(#3SdAVvy=t-*)z?0{OmT{d zjUGIJaKW;pfK6}F&iaf#6pohGBfr3Yv zb|3r4)Sy-{+zkUc#`hu|?{jpLs$HKnO>iQ^@>vY9WS)hk_g(})BmEy1z`djc@@Qdh zDoA`ZGGUtW3q+jFpImSWUg-0JeCU-|yTV|sW5d#H7=BUFVbt@Spe)8=5pX%-ck8UD z>H;MLNmOcLAbb*MSku%MnxiGK=O|4fi($irok093b@QCRh-)+-&Dh(mEHTOsMCQVt zdd)iqn=klM9z_)#NOTGPZ%X2K!pox^fF@w6U;P&|*0qf?!*!EH zrZM*?g516r65dqe#=WiKbKo|-I^tU38!>+twOnAXp(B>A!_RqH$3q_P!>bAT{Ic&g zg1YuDBsvk1=H)$CBkvaH@o{4bFU^AAn12xvQtuIZW+z*pruiuXP^~ zvxX}t=B2QW#vHDhI{`BRj`sRx)|QI)WPAO_!yy;&OSrS;~0Ruc>tX8i(z3S2f@r^Fhu)nUHPt3;LK7ZT^xPgcpwX@%qZ9odtK(7*vy< zyk#Kag!}c5yz_1+&F6{Qa_d?)EgqKONGYE=Ld| zsLpQp>8?5k+|V^?q5huk*AgK6q5j6CNdJR;1Nr_=nT9MK{cds<4ZSw`{+f?vb1{vH z8`qD0C5H3pZ~(s!KshXlu?Y|7fm27rig=r0MSD)UjrklNqvT1np|$GH(n4wJw$9zL ztI2e`ToL!l#mXDXRZFs!2Vv8zxiaSKLZ^TnS)2c?E&mGHFA86qH z^%7t@pcIes=HJ`7>hld7!{hstae0{+hX=P^5ty(6*7`pC2C0``g2NG=U|XH5wG=PG z!y_@#UOydo2sf?n&e}3hPMDcyj!tOJ&zzfjZxRE1qdZlynGIMfg}=We)-kos zz!Im2^W|+?pef#ay4U~F+&7Kbrqo{hw0O=`k8efX_Q^k*`!n{KY5enRtRaTr?oD)9 z&i(BJzyB%$$He;jGIOD0u=S_f?Ys;B?9VSIV{(Su8D1{y%H&7CS$coY5e(mN=4fQz z7{nD9I!hgXjriZ%Ir+~= zZjiwvAMKwl$TFatH@Ck1b1vzNVR%@!4nIuJlk`kD_phG&$27cPhUtE3mOg0}7i{fZ zrd#!!T>IDRC6N<^mG`M)^3a&aH2!FtfcT#yC=dG#CULBi=$6v@Qr}l#&ir1Jf4v7x znB%@oFw&K@Z!i9%H^&fb&{k*OpU+@zjk_cKe!oDR<%-|o{RF7K0M|uZCfx+ppRW$V zQdUVl1RDAQk{aK8vi)o61;moZLU|e>*>zxLBB2b=)4{p`@&l+QR)$iH8G_^#L-zHn zY=6F{7$%>}XS!YII;@idl0O?3$aux%9ReyaWg-)WzJQ)1Rj+V__$<~&#{zE&-9`zb?a$U z=w_SbEH3|mAYf}!@v~!tM%W$$S5O<=`q4KMK%eY`8&xfUf!7`ief$CHx9%(bo}OTV z0B{3`!A@JtQf@<&M9h$EX_$cM@m|rC3A*swA0BfpJNBF_z+we!dKQ zv$ZsAKT_q4b)`i&K1&;B2LY2&7Yqyg9yAgwxj2A-@%|mK3d*3S#+5XuzI=U4NDL|l z(_{$JWu~U1FEgiz+?&1QZLD;HNL$HvumlyhXp6~tCAQhCt}0EH>y=h1l%&$mlKF`{yEe+c_NQo&88!JRwstRcng`jZQQrQNmAdgws@N#Hs5g6G!4 zb9(x4-}mtN4ccTrF{;#@D<%`~0-jy!b_w$vQvqf9`;&_*fhpqo^)Z#rR%q51gX?qGu;_ zbUMf>XU_4P!A50Znne6s#F8##KY>_zH^#t4K0DMpQ~~G!^XMcXMKL%wL5w@^#-o!A zDJ_J*hPV!aW!mYNT09fmUww}?mxr)uQRCrmZoWZvH;@XxrZt}t|9J-z>tvg14I4MQ 
z72Sb8kT8$N+N3^qfeLi{n}vREq&AR=RzbvFMQ?JA?EdM8!VqPfV3yDu!-9HT*8B`JcGqI zux z2M*Tq`@n8J0GX1EXOEyZUwPyH6i*-2s!kONp3AOZd?9^i@c005^OX0XlGP$`Rk+DA z#Ea?xtm(w3yuZ~RrgUD|C1YI{O>-HDKu-_Gj}B6>{->btzUQ`wYB$<>j74g6iYQA^A(-o;LsW6_zd-Zh+!uXfj9kc4qofmrRzwNhy}Xo@9n!m05E7&(?6;}0qomj-~9PVTUx zL-NO-m3jW=*|jk%@WfqiL?gJmdJRkASo)ISx&6_5wHNB&+~~Pp|Ea&w*pZS$tE{)T zH)~H_kNhDMS_}?JK@c&XshKY@^4*zI)xDWF9TK{JiiQu)xfc--nf>D$p)V%j1ePtK zdd)fQh`VCthQ+3ju#vHXkBy-h+cvHdnkB!P7C63Yg%G^-v9r^Cw4FkeN~cUWa1h|n z@AI|y!IN=+gM(ky!&ASZrp`OHJ?=LyoA`;fD@yRSmw9sR7C5A$I7`5>abryE*4I$% zIInCg3>QG6(B%MRAe6RSiqSdWV)LstIfxCT?lNojVTj(A+Yk4-^%?-@i3{>Rd?9m@ z)o2bUf-#9P=}_3bm4Y=byEAj_Do^bq-?kBzbuIX^qdtY*IvAE(Na`;*Oc*=byfPom z6kY_(ZvC4#Zyxr7+-J4Vcyy``6mswgAM+y?+fey~3M#y-TZfBxDKbD8 z1V;LbfQCv`-B{Ofxou^Onlx$8$>`h{{%D`*wW+pReD_Cg$NL|IEpq{>z)yHguTca| zwZZf``(-=%{2|+r(U)yt7AzadJ-e(?Stvv<&D;0B9drP(MBW)Xc5EU*@0@VMSvZ&s z=>FKNkf6Z&IbZ);0$kEzC)7`(EwO0Q=;_QR9xTCh5k}I(G@+Z#?XFg}i@9wN`kDM{ zYkPYc$MasobXkw-sRgIHqhcXP8#@y?8X#rCW zj+@gV*1N~j&&y!kBacMTcOLpTi5wVC&P)4?B(d!l6nQ(z`w|x2|Kqg0{Lo(}{aq6U zq4NAlHs5hRw@?i;H)bXb)W$M-etqpDozYx<9+a!Oo%ixHst}+ReUQN(bzakcw8dgY zso5zR@r~W@Gcevg!qci*>Ojpb{iChqjn`w10) z1%x-DFY~a+$0mEbct;z<{Q-yh4|EXs89^0N(CuRF_bCW z%&L68*O-d`i0sQry+Wh<#XfzR?ujBZbGcX&2dOO-A4F8y8rFONPkSZt@bS;9-gkzy46rXmaq)(=Bx!=u0OK^9^MsM)SQ@B%VihkckxZ}ptnAgl0I921zZl6#a`q>+7!q7*H@0KXSNVL z`M2I-^9r||8ija~`;k?Dfg;}q8-fYSrv>BXOs0*Y1HKXFDu`+9;LD$9jIR}1dlimf`q@I_gRI*9Ey$jA;gt(49yGo5sXabV~NH94mmwqU?2~&QfwT^T`2VbgB z$%(q*jE+waJ>)>F;&x80a1%}Ql^kt7&V0eF*(Lv9fyP@Pg8j4D3rY5GZw0V$sf@>06_$TVWShTlhJa>|*^;F-u%a*nIzm2_7_F2ZVVBPl?S&dii9hP}Z-w zvG=bi9BGy%(LMH#@W&>68-6wqE$i{VhnyFePea>u&z)Ujr8)<9N5r>}mw(?LRV=9| zaLC@?ljiI>U3>DR5Bf$^D7*T}%;YR~yh#XUjxAVdD|>9ld)&>;d=J*KTQ+IU+dlPq z*!H8SD_Q5W`p+Si!`yyT8hX@&YnQAY?vi3P`;CG>{!PChZuu4Jkncf4YR z{aBB6$5CbDL5Em&0g!Od>nVR1XmqgARF-wa{AUvoct+gyey+Z2meFB5n@%Cqp_wS( z8(Ee4kz3XiFMe#2a*4~iATT$OSwBJm;(kpjl)@6Q?cH0Y$a zm2!}|W%`5Q?@e_pi>WgwsXg0#p}Ln(al1H%vYblHQ{#a_P5iB8TJ*@>F_pfQ)T zCLNfhcHO*&EqWkxhb8ON$-OtEl?lOoTX`T}R!H$q 
zPMGnvV!0ekg;nx|O@j9qc`~iKZS5}N`f)aP1FG1U?7TPU(nBE&TvC0bQh|@cT!HsK zSoelZII`EXof<@(6!IW4RK+%=aQflKXXK`qKzP&Dkn1%wDl%TGNtIz_J8vAOEo|?MBCL~DvhL{=wmDW656GDF+;Bwb-186U?WU)Q_ ztSF^XlbV)T-`N}bP-`?_QXO9j^GyxM8zmIo+bOnfuuX@aU3xQw!fR14rR6D66sX*| zqC}je(XWyHiY$-0buJ(h=;cf3ky3GIwSrFZ8yW zY#2wg`rU;K&q&Ws72IX1fJd9y2@SnpWX^iV*(8K$s%k~WP74UWkYg=Yx$!#ft45(F zaSyXhHWQkzGbCwW-I_>tctOhKEk$eEG|$|11w)KmT7BemG6t4j`9%1Wr2c`bMqI11 zRblze8pa1ucsH`iGUCv~9%)%OF>SrPCgtLu>k5<2rr2Cf7-j= zIFTz4)Ni~{inz%~C>Q!!EypNTo>?s_M#(gU@ijBwt0rax`8Pw_{N;@G6vE4xpC=O) z-$gUlYL451UJtju%3M&Wg&KvnGy0N5j$%&WzIQCkH>i zocz=|hSY9b%GZ}(t0VE7^i}WMoTZ;$t4y5}iSoS6tFk%D9Lru~+(C=0Zn#~3$%sc7 z{GpenlS;p??O#|E_6C=`gG(k9^*F0c7=`PN{>Xt+XBSXTQkT1Zpmgi>I9(>X#evVG>S^0Vr#X%Ek1PDH*y_sU^TYnoNIUYA=`k4%0L zmzJs_q4_S621h=2JuNd-M4Qw>FGxn#`GwOZ6L|qWF$QHnZ3RJ8GiM@p-g&tf~@OrAv1Rx!A_{&OgR(t#u>_c4FSevppapc+uY6U#P&cP z*GMQrfoz7EF<-azgRGAHSGJr2-LJbEDH=0#7cd-}qdkGssH-RXoH#v}?39?;-`9DFZTx+!QW8yE!wcqT96h z15SvBpr8MY4Fq4sQWp47Mmg@W4_hWUX-wm%BeOE(#0Y(w;}GmTcvpvF>n%>d?Q>n< zS2YyGs1H{;-!10OF&YI_Aie(S!;2SrxxQ7#;{B1<{?`Hm_yw}7Mr$Z((F8^9?d@u$ zJ^a~fUiJ4{3mbM@Q7!CAYNuIli!Sloc4vg_g=)@c^yjO)=6c<@j7Af|9w_%aSTHUS ze#|&Tk(8yOCE)dW+bMnF#t+tr=dwzQ-$%{uf8G0kL)dXrNMC$Tk9VE76E=DG@PRjV z{>QQhw+&9>Up`$1@P+$3Le{E+q*w`TXaG{geoGqswY}+Iyg+EE96_OW_eYyS*{ram z;{^>!&2l0tK=`Ns4et6yfP#;f2DL!f^EkJ{i4fMsVwrAk7q zgYbzfuv|EZ(VK($tD*8#=yC-tQ`5GD!T)~AFfwix|Cs84?+`kJsLk?>CKGzc%VYKQf1lO# zFR)wr=b=CB%|(AaKn(&9TJ!q;$N&D3->n7#jV$jslH{So@|q513o(g=v>+PFhMZbr~bE90zu>x_aWBRQK=tv_wV2SJ#Qp&9|ZolQTTr_6Cf5;OvtPmV^Y}j}S5h1r!F}R?ZUlV@!N~RW zCj$9ib`eP;L2!qB2u;)!JXni~48EYq#RZESC7bN^-?udwFh!Fojc-)XKKMdntG8ZN zAH1ljDt=x!F?Jd<&BZjV)6ODHu|gqKQE|TmA%7lO*x3nSB&1*sr36YSTe1SN zW<2YO03B>!@XV+Ge!Bgh$W%GpW|TJ-D>`E0ldL8Uv|k}e#Py%oHGL624i9EMda|%o zKi(kaNKe%?p1R}aBo23m4wl>_;5_|zF_hnLzr%x@6L(1piA3}&y(vv=;#a=U)EpVc z)(77)MJDe2%Qy{$`Nzy>Q}-%KBJs>*L#&NLXQ-FP11=#<6`;%Ls<_|LoBv;z(+E6I zpS`|+KI9f_TK`IPE^nfi~cDmqMo?;e&$M-~XsDu%LBD_g_xaGaiR2eqC5 z-U}dUSEpxXlw&Ey+mV}(j+FoLoss=C_U%$h!vFI2%TfNU$=Me1i1I*FJ(KLz)Q6Is 
zU{N%`6m1|w$oxHc3|SI_+-o)}faFb~C-o>$|%A57rVndqDWhNw-|F^zJ`}bQ(ro|!nyXrn5z6MEJ z!|}nvwbl*<{eO&W1fou(D-b^`JU$H&ktizgG`bnP>i(tQ6pC+bWk@_p*pdzSQLt6= z%RKyd{y*N{Dz3_{iyKv3Du_V{C?F*vQX(DFDBU0pf`GJ=B8`EdbV^HiFFH&*q#KlO zM5ODC$G!Kv*ZY0%xj7f-PWdV8S#!>3%n|?ipG*INvMbr<6Wg<#AyQxQ+~~_ zmfC)gZ=z=rU03-Dc4VEJ6~4m_E_t@|VLi56smB*iO6xQE7+(R9(9rqq1?HCibTVBk zqgWoKVa=ys^}$Q)ch9h$v?Nc5#d*`B^&LlJt}b7z4!R}RYREkwt-CV3+n9Oat&fN}LSVmz`DWr7W68{bHgPrMO_nQ5-D$W! z+P9K;&j8Vs(wocD>ToJ(q@PaK3dCGG>Pz=FS+(rZ5efum_2mfr-|L z5!LfLwN>9XXDAEqgS$}UQ{$)}au*lZ1M8(uVV%`i7s^b2h{+19OSC@yDT*tZxJC&Y_THZW$oQ$9XcQdkKOZj$%V zwtu6P!(pGEsP~m85joP$cfbFofcNL{h)fbxO#Wh82~%|LyH~vz%F3F@p(d_|8*!x( zMki^KR5{{jWwW$;?COulZ&y1qvG9n+-1iy&CFIUrtIpIqKOvlI&&Rt9XPIsyp~MJd zX|CY!>^7P>4aARG{}`x#!PzHSo%8uyq3QkM3=o&_zMRGFFs}qYr&^j*cDIoWN(z5x z<^>`1Va*2fX3}%F*m12v?E%GD)7aV0a$uuKoI5>5djqx|HKrN)*HrsYSWtR~Dzj#f z-CGn~>Yr^V5DXv}S?>dDn;lRJTLvf0h6+Y?9n2rWUdZ}~{RU8?UC8TZmTa)%> zsfeic)~NK>ubgU&tma2bO6SNr!r_DTQMxf^FVE$#i*Q;mZLRe0ydsmE5cb$J&rwae zJJ^#|ZMLXo)-}Xo73y~f0<_T@TMot*`5OPEWJ>$7;kyf@!((;&HB5P`8r`JkAFJ|i zpoK0Om}}LuwUYE!IjvYk={ac?ia$m$n%=Jon(B@{Wb>4-FES=<00Nn5f78Cslil>~ z%KbE%!6l{axqc!hX8Ygsg^fmDUjS4VS>f8+g7G#RNl=;94u#`vZR@+pT4FX*s6*>2K9-$M~0v z{K=Az0ckLA-UH#Kcx!5Bebu!hlEbskdT$aJVFQo8=N0yb%a88oY%wRsd6_2+4(3j> z{oTX=T&wy1o4KK>l*DTwO5p>R z`gxihw{B@=q8>YL+cH(#+CFZp;fD)EOvA}h`sYS$FBF1*nsv|qnohiV56x|g=d|?5 z?vCF78y)fI9O0e9ns6^^t8zRv&^BXE>GIc3Nbc=jx9|5Ys1m=Nwxx^`B5ypwpbD=I zJAI)|6W$1~O&p+=H0Zhd+^-$1RzGlkZJi#&;%<+58BL5I<0pV^q&>OH!i2f?1m1wm8ItW(EDlLIzJDM*jP4TC@<*Yy?CEEC zM3dM>Z~W#Z#%dNfoZ%YK93Mt&q3-*^%iE+Jt zj5iIB9n1f2*lX|u%~y!`kXkrz`ov$QdYkfoOkGbj)fFT1Q&7woE`P$!8$EeeGcU^E zBCbeBZ&u`3xoz*4QnL&Nv^QJr!>C02zzdZt+yYs~_i_>vY}U?`5l2ppHxu-PZ;|@l zJ&*t7wWu3_gWIDKb9(z67I|_e!7IHC)&=% z8P8l2XtM~sN*Jk{5FI&QQ)x2#SX|SXNN_hcO*YRO1ey#t7rE`5@N%>(i{rbq!c5e1 zOnWF@Xld*j+k0A~?)|^*!VA$PL)w^E{OO_`WWBCNS5hKZvn=Fpd9>v7+oCNf9u_Dh z%p=JWoWuGArnN+HE{zy3* zaOd3cdTzBWOSqDmg5N9C623tdd;{oG3$(HVM>{`3Swg>P*_Se~BD&wX?TYJzhVud$ 
z(uDi_k|aY7!pEhjC)~~0I^xW;Q_2h?ZgMXsxGtxjx$tQ?6f^^Jb!(piuWKV_JCOJ+7C{z^hkE_Sz>q2IhI*y!-te0uD16*0@FK!w(XTOU=9 zb%I!GRLCCMG z7ga&DMUZgtU6fgUftT^_Ii- zs?(2VhP9ayYEM~|cf0XED#-;~! zRH?fsf4bxUK|I1Nxf}q%mEs&694nv{jsE@nw|jhyQo%70#<_uyg0QLq)o6!rWbQ%}#$^)_0WXZCFs-;iFJ=`|K4?P#UKA&-1eq_dDu0p%$ z>ACgJ(Pa{ctS2WbzD5W8yqQ72Vr;r+?rZZOB62?VfFm~y)1tw64}PL*W=|1B_x7o3 zY58y^{m13erEx3XNk~pMK{X@YL-;l>+{d=1(2E` zIGIK!(iP~Hmk}xovd?c2u6!7;b%o);R4#C5odBgoJN!#iQ4~w3T)Y1QkEX@po<^Sfm8xJ%yF=&d=N~FW=Xm`>8y( zvh=n3g(w%Pk_7{?78dy6lp!)uy$X`4h-U%-U%xjg5csBKsV)QY(Pn?GzW&X(Kk!_@ zZP~|x-rsUoMnW-ydd_A-$XV?G<87x_Ua|Gu*HN33-TGLvRzJ}a!AKZo6hn%qiMEhP zr`wmT3EI4#{>1xFUjJ98<-pT*4$Hv)@OYi)Rb^}V$dQBUIXxxow4Bg$l07QYJPRFf zWo|CW(9ue&RaME~lnGt6AnMk;ITQ83q%z>ukK?EA{m$1KP0Q`e>^l>(?LsX? z0nb5)?=;;cxG*_W%iTJNiUbK>$-X?x3un$_SGVWZtn$C9+)&6Fs&yUAi$5Jj0cWGz zxLpn-a^s7LuyP$gvlxz)KqriairrOf9j$Aj@SLsZR3Clx(S1FuV;s>tquoQ`!e^bu z61#XncE=)48Bl|OqaBDFVJZkt;sg*s0vua`;K#bY=d?Py(C~BRTjj&%P`B9gnQsHI2~%fJVzu`bMXxJ9W17i(xS}hE0(`LoO)kg;hrm%&)FyU<|=oa zh0_9ohNW52U+kj)8jXmc@PcLKCtnV%^o)d#nkQsxM2^HA9^HuL-K{@!PG%uI`YtL+ zBtsz{0m_wvCIZ4oKzw}cS2LocuP~77*)`LNE=BJFDOfX6$bED+JTQ0nmp{@cfVbXq zF2Ob5-h8*A#${u)3hfJs+7X0u8*}=R@KFeGp}_^A93Zv>;LBhEQ?#*d1KnCW+KxN26Be)cT+W03@VnVp`ecJ1b!h@dfO4ozGX{3%lI)Si_X1R85@ zlESum**FXg3A5eNh+%`|y?Z1T15&kAhuyL)OO2HA94vc=_4DqPd~Z zkxa^S@r`3iQ!$J@G`L6K5&b(zp=!gK9@tbJxtVkKbbYQ2Iz*9PM|=3@y{OWTZkG@i z0Qh$BFER7H(=|wR-+E^aq=bTIbuu|I)ubt8;Dq%~;yo<-C($X@97V)!4be`LT<}e+IU37^vk&L>AF!hC@RRyZJgtB{I*B8{Al1@k529@}XKRkx{l#8+J z#rp1j-4H=4duTHkTr=XRl4iX@Zt>IgH@7@7{f7 zFXVo(-k7faz&mg$zq+Hn2k75q!S~{`!HjAIhQceLq*Mm9(rQN;w`!24SOyIDHHyrN2h9JaQ?hrh!RT_zkmJKxRKv5^&N$0m=%&HK*vDDmT9s*E!yPUFZ` zDFm|iC8{>j(f^QR%(4r~x}I@VYrdjim7U*n2W%c9U3D$*0N+HOF*DI)FRKGrde&Eqx{+TDky9g8`L{GNTD3Hg`BAanW*(X5)4N+n2 z4qcIYR$R7@698Tj3Cx7*d_=+s7~tQ#@6ky}JE%0u<5&<>PIHGfr7E4v&^43UK>QT9 zhhtIcmLTUA$Txchp#g1-Rj<~9I`HTQL7hIxe%DsLZ#higys-&dOG3rK;pZ)q_jxB_ z?f1NndLW_uexJI!*t>2+y0sn^F-b1@ARsO>J6g5qgK zMmadNl6%>$A~M4117ei{NY4V=V>=)R7a#J#k>9y2W_|sueU^Hj1+a!L$|%0gSNSRz zKZ 
zOj02b#8GMb42Nr_ShZLlS9VD)F~^c*pwa`c2~;(!GssScN)S&38_TtybQpt%?5R2> z$4o0=8RZn*d!KX~DR|Qi# z@{l&dZ(C2&Uz%2%hrR3e+SFTUA1TwC<(U|athCR&8^xMGB+~Hhy#`uOAZMi9_Py`S zr(Xys_8{So00yxW8|!3cFKu@7$tg1L%e_asQ9W53v6L$I8A&>3co$28#c#4GuV%`} za`T;(5)1a!1@1%#K2e)}Xvs2wpGQCAZ$kLwXNc-%N3$x9hQuyHj%LT0vG+?+wSQ0D zOfH`p=Me!G9X7W`Z&R8!F7lv-{E=+73P$ zXAN6Gj`}2C6}N;H1Pt~T!4*&5yhuM&)Tcq|S&kU17+3I5i@M!{&x9M{EwKp+t1qM{ z@YtXOoR(Y>^p zj=Yt=OFLTO!DQCIzdYQx(F$ae%D-9}ml>dO+5clTHwQ;gn)qTH9dDM$JCze4+n_V2 zqNeI|aPFtfAn$6$ zlx;guB)Uc9N%xZU_3eY{|#BpEb)gCRm`TA-n+`8|P`EO~bZy#od?!-wP zX4Q3nMMFRU*m=mQEa-|rI- zGo*Kp@@xvpIFx;?)~ML05f+^`Y-HD!wUjpJwUH-~xgPN=OCgq^sD_WOxLH0UxJOYe zRm>bTf)R$JWEjj!nP(31QAw_J(^5 z0zQ@D@h*Q{&9%(WskSniDiU%<9CMFR!BElEq`H?f4kR%=9>l>o7uTg1n4aYp&VmKVRVl>eE{<39ddj8Ts~w z?yRI{k?3nQtuK+v+S^VN?mgXWJh6ZN`qy{HR)?NI4Zj6%m)K^wCdprG@!>ps!2Of| zuFGF_`QIh_IcPrT1t(SfFYH41_2AlPj;jw{4{A*1SErN%NM|aOG8+Qq4Nxc_1mp-7q`YzFo(*7B#GwSt)<*Gp*i`dsSR`c!MM`qSvRy zsNnH#h+`3S6A%*G$6g{IVC@JLQ;I_#yL-6iPgiE}Vzp zqnNh6wr2Cz->-;teVy2S@WE=%hv3W^HY?~NPFU9olkjRDkvueesM)a58Cz%R?&y17 zREgBBs;@=rX(J5E~J0q>%hbuBpQZ#1&emRLf7-e z_g~PQ3JKoB6&U`O8EOTqn_&QXd8U3Uo;)af>3Xtd?iQm*`=6U=Z3uHs3p z^|da>xjXr(3RYjZYE++|rnMw~u_*E^#iO&_77x!tNAw=2rKzK3|M{Jc;z%~d8aXxyEh^H+`_s(m^>KZNo2iol>%R>HE6rekmWs`=SoHZ4ML#*(`;+zgLKZ}hXu z$Q7oheySc7yR79GNq2H3AaM-UOZpEMU~Y)4zGmtR=U2zZ-^ELTX~9qZKG<=f{H3W5 zTbS}1*c&z5cXo^dgY8GsK)^83ZKGY^SPk3@FM@u5iT%{%^5n&9xuE4n7t^?H>SwMF zRo&v&3Rxlh)f;v&f0yjgDkj9(XFQtkBZePzG?aVzafdIqvBCiKfNa%d?$xOEHbQjw{$FTnX8C27zh8aFe>KtAyYZU3e2%9nL=;@ zn2@J~5+dR#m_^-rCQ&EbZvQ^OU~iIwA-}XEzg>WzIslsV{tAXm0&9cmQ3hozpd4W5 zp>E#w$pXX~`xJyKo1UMRk=vm6m>F}n{`mqYlHa3-Yp?v`$ksK1<$#yC)?b(kc)hKa zIuo2!w^t?gbB-?%^pxdk8MA-(je5#_=CvT4;1K8ZW$b_ZK}m11rz85asvZ$^`&>I> zMT552J>6k5ee<|Ue_WQq0@emLHdMdxUZ(Z48KN8|bhbJGQbuast4-Y3U#@g?l$}a= z`YD@;U5~?fUa7=!39G-|e!1=4)_iXkx9zl)Wmo9}MdPo5R~I6mcHY?Mo(0{!qR_=r9Cif)?-M8mvl8{lyI=ltun82$LNiJ0Q$^SqmJ5CL!fj&7)L#u zs8f&{6sPMot<|;T;}+Syn-zBVNfo{?f$+jY;}bU&p*{?Xev9 z$!}3L6A<{1^OaU?oSm#Y=;rNHO43V7(Th@m><^pVTkqYwvInEQIT6v1BEX8G-rISt 
zECT4>U(ocnHOi?tJ^R0lGA;U;XNz0?9Sl53m?AcswMwiO?`&(jwMM$?ZFLP8=oEl# zs*qmQV0=_m?)vL5rP-js-_ELsRqS-ej~Ur($Ao3TCH*t|Neai=8Mue*5avB;)5x=; z>Z8rutK6z6ay_7=q5y04xkxk4^1%|Dj^oylMw*GNR6dW6bp2+~{=r3_9zJj!Ra*t~n1bG*ak%`?a;})*#EK)mPFX20Qz;sTOwDt}`+o zW6@CWM2w7*8EaMX+mOY2A7A&qvpgkab*sPXnf>>Htbns7 zdH8l3c3KMjb6sBbLRjqI^V?%r3Phjlj+G9Bp~?Q{nnCJ=_-KukS6@M)BV@I8ZIYdl zaci*D-026j{yfkYTD8My7UQYAVZmh&ckP@<=*$w#FKw>qOzz$W!B`nz3Jb?mw>11n zKA)hz2fI8S_~JCi1Ha%u$whi4INU1HGvshHZRggQD8N?V=JFz7WJtCHkDf1ks8 z!}Lw%&LquP)Wlas@GigLxjQ467?`cryjnd=x-|6u`I4A+w-hB$M|Q$f8Y69K(Yy19 zug&qF6F3Gum^Js0 znwGa3de4v!oC`XvM!#y-I(=|@bInWO_-KUI=dk4Qd#$hj8aekajjToLk=};E1Se9%fV9r(iKwHMrCturSpa ztDgM=WOV&QTseTBnZP1|fk?qVagy)bw8TbUK>vGppUq_rZY(wF@@ckA^O|DfzS1*_ znyyoG&T$`RUrKP`Sn9<(r#?56zLdC6t#rcs_fo$7W|xJYTTvew?>D|483J#obQaB; z!F*DoQC8mFhh+AxEUqjoQKg(q&-PdPll1SndbUM}GLQM04|yJ$%g?qkcJYpqzeCn> zLw5)>FF1iMC88R3lI;^+T1ir#tPJ%`o zJyH^8AnH2nf-Pb=fhq$PfV|oRyBxRq_x(>7dkJvOW7_@9xh%EER245^AEgN&Tg^>8 z@kI+S-f-Q`ul_~8(EztsM@Aw)`^xu*fH!y8jI@ioBHobU_q2RvFZpBD`d4i6PA2$C z)IB{~RW~4IIYyJ8`T522CP8m+wBbhvM%P?rSG`PaX=;M^{VN<2g@xy_QE?v{Kqvfy zabwlqBXmIp4b@cj859gZ<$@^aNNq;~uW6v|Ok@q}#{=uPv>8HYW&}q{P2M=lDU@+t zy;}AY3`RgT=gpl0i*ytBL$rOmWQ0gK(-u|*dn%In{a!Epi<$W+@Oc;0B=vScsE)$g zJ@R?EHI+c0U#zVmGS|*Dqe_@LVy3G*kK-$ItVWt?+k=(noo!gtEi$8x)JBu|n!=`U zt)Csays;VRfW&^AG*xV-rOcq-L*xCkO)R$tuU@)fEk2`KDwnU(at|7OCuC2W1QNidJ=Y)U!Bp{ zFfPH%&qYAYWmM13yf5#nWj!lP1J#4avid$TFW_Z=PrA#+npCHqnQ&$4+*vFgL^4T6 zfqMka)Hx#~@g|<9^Jagh^85-4KwO%Ha^Z=>rnT3UdQfN~&f8TB4oewcA zS}LIrDtN?T-6ZEf7PDX^Qla>~3#nmp?PqKFjqK8`J8up}zkgPaLJLqTl%f}MMhW_U z*iScwyssL))32^H<>RVIahZ7EFa+hu5uI9__yBMdAOdgFhIIKz z@DMRM+6X!*sc)z>?7~Sgb!~MnB<6oLH4Lxq8FD&YDLoV2)8@F7x{52a( zNA5RrJp_t)+Ju{~KL3`$*w`m=%sJrp=}Ng*y^i!hJObj=`cfJ{;Yk_UQQ8Jd%PbQfmKW&d&s~VlrAdWM_+{^t z!YddXs6>0qqBm1lRDP+N`F(vn@x`vwfy<8-qZAwU3@WV>B9wA-f2Rm z7_QEL@68xQ((7{YLGRnOh%dNp7BqT^Yp8PUTaD}tH-&8j8 zMPrPXNfv^KP#=}U1nfzp4lJ`?#>iUFjw){*^d4KJWL?i9H;3-f3GZ&2BzhFm1<%~u z%o9}!CF~dDUmPfJTdU`(T!i_z5&P#?FMr!<=?(!tnTc$*4MR2%j~aDw8m*L>M$a?6 
z6BTCNMPQL4+0^>hT7)VW=c$kO?+3nG&rhBbb7oV|iMi%pn&!B=tRhGzcvvb~Zb?$L zWMX0hT%^3VZyf9xg=H$9((s(*9Re2X&FoPYX|<&RvsAPV{{8F`xkK)+6mFb+&I+TC z()<&f;ql$DrQ%wy2o4s5k=Xofd){5Q8fjhJXHrxfMk8hJiUXj;%)n5cj$@bF{O+y@ z6&oa+HLMt$NQHL->&UUUhT2vC+XR2y=x3j?B*Vj5^{p?{5BKfxZAwp&Gl)a^FG@uv z`uCJ!`R@J2j2CDbmsF3{wx&VL(?c$Tv#gL3AtQ$zP4+no_W~tqv%tFnc6DGOxiJbi zMb_O5CCgTz+iOn&j?}ptEMR<-zmJBojRwrssPGJ%q+kqA%*fCzgz<1&SX;R= zI$3)O7Rf?Y;j92+f_t4b@WsUl9w!ek4;|~RFj={*vMbL2qAXFNE6KdUkW^WanR)!8 z7;AQ*rmo1o?HfK(M$3XGh7V6ta=BHUN}zk zD6_yfJKTSJK3Xm|d|CzsAY0F{2Fl76Wb|UhuC_gV<3$}XR&k5wm2KU-U`cz_X zTZYTK1SrId^7Fs?`#(;@EGpn1{SxH0Jc06&6G31!#a{bN4iXDX%-i5c&D0<$l^4vkaQ6{ zp*_E&j3Bf<*pZ6$&!Q&%hWHafVQGpH<-EMECuxI@-EmYhqw$dW z{ll?=ZUr3uQ-7?z|Lp{Misn5@A&(k1U%#`3*9UsdyN``*u0MbA0yQTE_wrwD#~WmD z;7mTpYyI;LkY_235|@{X;S5Ai4SyX{W|?9+ED*N(lzL2VMbJf?j{ncs3rB!d#m1d& z|NH>?>?T#EjmG4?M>0#P%_rFY4x}POfQV$;=P*2t@lR*syhsUY85nfnF7;JloA1@? zFt}p%7hz2-6ym)U1@0S+mADavqRj?*xKjbF@iXg09}@mNG8H)zvp+Ky-TgOa|J_AK zR@FP03XUy7;|DmB7(f2cA3u4+2l2)cSpKbg`S0@m`E}ShusqaH&ND;3`Ohx>bvXX~ zy8llf9xQy)akrP`nvhGx-=EfkP_%~!iap-vo8y_Zb-*pG3kU6%EeA0{xY(2)IQS z=B(ck&#ZE|wv%p`I0vwKL3_B}Y0b$%`2y**P+B4r*CRt^`54V!&7hgZ!9&{G=Fjdvx@64m8W}&7& znV}sbZ4ZH z5OCIZy8FsiYN9rA8$FSExXqKJ+2v<7WI4d^-GO6gs*xf~f{c_izZ+en7J=#$bb{EBpip>s2O|l6!Fh6`RH4(4o;oinIP>o=WMeaZ6!I2J?K5MOBO^N#1*%#_$~jic z$Boy@9{hSZ@@vRUX?!xx-=w@l%YxWsO-=J4n2CkY*%k71!cdY>YC%CRqifB{_6%;t zzPq?;a*20Z^ktgy7anJw_P;p=yYEdKoUYbkJtD@MXK&!zwzqF9TR(L%YW6CJl_u4b zTN+m&55fH`ZrqIVV*j{DRdD7MVB?rrzEMXF>jlvc$DniU;P_{qj=4Pr>D^bl!eV){ z0o|nLQ2EvYnto zF8KdECgcb%m0dF0WOT87=!>d>=LNFp1d+(hH~k!&LXk)2_1+ z7q{qxJ^Iu&=H7Dnj6eOJQcz#Em8YZg)t~5;(|NMlPM#ctiGNzj2S_lUcvpl;<-%k5 zp4h+p;k!BAV(-=0E1_FgvHjm72KGK8+8GMO_CqEoNqT0X5qgK)I)=pn-IA@IWu#MX zUDEvg?1*_I1=(wpYIe{Les8_|Q+zS~^xH>IFM`f;!dwHUfVNn;7r$e+aYpik=bF{K z*&!yZsk_PAWi)*@*R~Rk14}#c+^LJ0u_bPK8@@Kbo*`e-ca zMjlN@;o4?xwt78i5WhVp!p#zGsnZb77>VTh^W87iIztIBjNrp8VQXHK_dhSUmq;ew zPg_nU(lLJ7DrX!EN;mG$`ZBu(BN-?(q$Hsi+YgcRvL$V363g9h_`%ETwfPc{ZP^xx 
zn^rrmmA!)*!99wnhO46_7p_?(9{+qkWMnZ}qvVa=drapp`B{j+V$l8*QzCB~5RJA$sA-9Z&Ic8>bPvsLL+#u49i6*2|y|HFDdvOcMNiA)$|Ole-J` z4sbbKteBCh+~096i5X`{psO?zE}1GDm;`)@nCbTCLdt0h8BX{6O84?KuTES-^3S=)*!Od_#rVg6o(rP4 zC>6N05D*?SeJ|j8A+BvEntO1XtFjlu`)krpajIf$DG43Nel3k64VD~mCv~;W7dxHf z+9R*AnVI5WyS}_uUtX$T%bf0Pn{$=YVEKu|R3f+xJ{i!rycQ(TK|lV?(y!s>&)H#g zyLKrgQ+R5-6;PVE#c@<)q%PQa-ljs5xu1suB1r;kQ0S2B_o~jEhB`{=BKGpI2@{CY zWyd>&Ub|%~r^Fq1*mpnDX|qow!`>`sq|)ARcE^21TVQiGHg8FqVQBE^P?l4}n`i&E zd8!Ms6w@RdQ1W-Kn?A?**4{8C$}54m5!HXy{0CcR*8)ekLiC{sl_yp;w@s& z;*(b^$MY$oq7huKWoehLk>asi?+0FUYE!?j<hW!@sQe0$@V3tNfF36%5N%Yfk2iF3Zgf+F* z>;oeQ6-q4V12j8z96R_l4s%O#8~El1(}DhX4V1*IomWfe!BuhGx?D>QN8rfR^Pnhg zbBP3xT^_aO`QvmfuWQ-Ya;pWEJC^DN6oQ3~l(q@c;GJiHBpE+vji!SUrS$H`&yyH& zUd+$yK3kshXHAF+oC6M2D#p;Y<2E%a9vKHT#dXKnW#~8mq%PbjsJH4HGSm6W?lDXMfaidjs{veA~lZn^JNy!Z4Qi-rO8?!<6n=n&#S;%jD|X3RGnqHGD@g1R zgEGI#@6TB_N;>1Bi8-vIWjASC7@%hO<+8er`BF!tXfv~7GoTo~Y`J~$EwgV-eem7( zQlmPhiVIi|WDFbqiOTj9J#%GXX0vSA8pl_*x>}vr0X)VrGMgJms~Wk55jZ&cVP;fu z=5@Hc*6pORKwa5dA25jZh)i;l+n9CQ0z=Trt7e*M{zPx;ZN{|f>r6b3YYBT|+TIx} z2arWs39eq$LE??812cw0L{cs`9IK3kV5D-3=(EG(E~pPCCL^8xPZRInC9Hlah4#ngBl`Teyzn9T&Z$dDWzC;<5!kaN%JnE$A!ebZkDO$!o1xwQ|e#H@vteeVlx!W>j6+-1(sKi`Bm9F zxD%z#-K;g=EWR}AjCI7n5mvZ5_O8+*!erD_7@*hdmm-7I93O19bA@-7a(yH{G47BdHq~V00eGBY3UzprMR);hR|=q$I1?f+}&H!m;#t;U8_oa zDbRb%&j1U?0mm{Z4Jb&C9Ri(|J)c-^N3oPx*B>ofq~p&-A?ey3SEq~rsj;Z=sIV4) z@fnH~r={iDU3YqvXlNEUdVN}w%acbNk{A3gtlu$)SFm4P_s*r0Wj>6tWUi7Y6EIxX z%u&jV8EJ@g*{~Y>{t;#dSzIog>I30Pv?1fw<$aHO=$XHkUf7r$=~x-T?~(b1_1eo7 z{BXbH4gIaQjbu$wdqR87B{>Fo)IG33J#_Z+%BTKM`QNOd8NpPyAV(MbfWg?~ElN}I%UIOcZQFJ+$lAOrt&2Mc%3tJoh&@MV2O zt`ZE^8v9zu2!ie9VVOyRzBSR!`(@d{__>&%9>a5~sihbw*>9;!a8M^sP@RyL5LNs5 zJP*=O)WaGTvQ!Io9l%j*l*p3AnWvq(!^t2fhud!dKpi7UKaithivnQX*|UmF98uc( zHk|MVE))SN|Iv!N-ZBafu7 zg1w_u%KZVcoxyAKm#D8M7>|l~<08O+S*vcam-BfuWJkP`WfA6yk>FfRA)4c?1Su=* z%ihZqp5>*hV|fa59q$bU@+5IfhSAoL<`9pS<@L=E_oY1}_+N)mNe1;xcK^I73mU=5 z2m2Ci557lnra8j)LsQsWWcK$$*^_SC9DEXzDxcwcF(d;lv+km4)vg&Ng^r+o#{WFq 
zP>TGr%i63-@)U_BYzjbdcbr#m%#@h$!{W}PbXQ8N*UI2ni}k$UDhN(@OotXD_YE`B z&2E2>{t&0LP(#Jv*vAkA!Zybu=>4mPx)NXAA;QJb)EdxV@98NkBXiE2p7|p5l$yv57JUZ=uo6F~p5E`zw|4p_Y6`Zij^^&?NH1N! zW|BJY9Zj6Y&{~Iib;Wqa5Vl2!gzCt`$n)=#jix|=jfa#*JB0mDd3YB^tH~H(H}wLi zk}0^5*zvB-g4=r@}K7E0<`j*s!njRjE0 zLu{#vUql}SpU4HR{C4SZ)m30WR$?iU5!!6~UF{sdKUB~^M6oeLT{q*X*mS~}`hRq8 zPoV6urw5V@4HUSJ!xj9@92TaD{?>xP23@Gq?;NIQFuO>X{h_`2<%%RTCKf6Bc}F}N>&x)lATitySE?wJtJhGmSj3A2U-c32MUBDLDKVY-n?qp74H%F(&yp*aH|6K?3fYwOz!>H!yxVmISnDM5wM} zU@9=53^Cu4Ji1yUj=2;6`Oe-a00oPSC-n+*q5o}q{Qv#6y_;xn0Yn?|j``Zq5ZQ7D z+$5Z3<>X4iTBoPVVL|14oMdg?hm)26d_!+{s2!$GBQA0O`H>rTE~x8IPEF;RbtWJt zIVwLm-S-^GI3ESayKfDFi>)FFubFZI+hY;d|Lg*Lz`(y>8-D69{vF}*f$`4}*MP1u z_~+bz^6JK~J7BpG96%{N4y`6*n1tT^d6cYg@ZC%IKiyCx(gKW!0YZWAhbRLNa~vF; zKRz@Qb?$FCBtq6If3+~J)NYTL02g%_xG37+5C8j{A)W945)iy`(HopD8~1)M7(+v{ z7~BL&%wE$>5JKl1gj|vw9Oq++a&n-lHIpr{?4VoXKcZ5n@IJoZxeDJ;%?uKjxlY znLccZOq2e6Q*Y3OB<78Liy_a5_W+gP_Dh34Os$AjEl)f6En5SV zaa$;(ZE251r)&@yP&SBDBhVyuiXTcBM%6a%R0H$~R4cUA)YP&%S|1`|VN23$n_i#N z8-yc?rDEPY7Te8v(JPDskolsoL_BN_f_-`6Vz+W6dfaw5Wr>!EZbx~X;} z*jQUThD>!K>b{<1MD-Eyr|lS%v+HNc5dLPYJup>Qe}Ku!5IEjUzQ_W)*&yKeWgKdc zjb;iFN6C>O3BJQ0d+-Kog6S?o^dFO54#VShTGkvuJ=ot`aYAUDz!J6wbyJNZBT1f` zffY63#3v0`5l5nmS`9GJ5YhPaR*D$aNI3U7sRc9|z9MaY3tq7L&lczez+mP%ACYks z((|AB9z66fr62Mvf&zqa%HVV7XlJUUi;i91GdjzwJ`Vew-@Uyl^MjrBe((d?-T;#B z3c`m%H`B&QhpTF*N|c=fPscH5ID{kJX|j6v+YlP1D{VH=LE8e@@C(3GYgM}s2O&jS zk5%=VwnabKT!QIqn;g%px>l~amnziNj!Ux=d+Kg(HB%{^`Kv z&M*vI(y=LUb1AVmUAV;0HmyGo0%iN^$*_7kqn$F9YIiu&LO>6vQGqIqiaz7H5zuFy zMy$&#xugnJ*FDoLWTHdG6?`H{>|D&>Qj5P@R=zLS!J6hP^IG-P$<7P+dzT=>Tg#Nh zahL_7ZN`S>m-;-bC;geMeoZ&$N5+tnOI_~enw)=$x{YIkFdJ!R)*C6;NUxH+{rni% zjDL^(0q8-ORqL__f!#LWwl)MI@+notJ80c{)*&$6Q{u$}&$cuO)2^%Hr_*}Ra{Ttl z+0H4J*-*ndaVVU2NpE_O?i12bf0{^{^DpaeCQ<}wwuCOz7sH~t?PujwuiZ+!DnlEl z(~hes*F(v^_gVh*-e&X*YCqfT`}EYgwi9(uh!%BT`S_wugsX`0&D!-C%lziN*e!&K zUs*COrdpN0_Pqn?e05pow05Okn*07r`Krsr zJaB5=LEAsS>>asf_7HuStnmty+VvISdq2jCEw}NrgNqh%)Du{j)nl*iZB5oMsEi0? 
z{GqgvwV0h1TtOcqzaPbcfMtG`OK_yUYC$uwX7e%d#C0dTeU>F6qI~iz2>4OFPDWH8 zRX!!_dJ?Q)Be}&9iq;}aC`Cx>3XhGh(#hsLrh$C&&k;=bJ67Uy_aYN?_@Z%te&MX$ zZsC#xYWA1XpF+YK|BhH%SRla%bLFhpd!MI2oVwH&^|BzEo*~wM1~@QZSDz8hMCHg) zM}0ZkUt2r+t|Ik1^qT|ldq-BvClUneMjt^6Q5mbV5JT{0`cpThY_=+5<2GAO%r`du zv}ZG$DAhqZAkXMXCP-~BNag?O?#%z8Uf(~ip=Iin&LHQ=nz750RAWo1n6YIoTcpJj zGNo)Kq7Jf;F!p7J8B8156DeCLW+ZVi7&G_0@B6;4 z*L7XbR~p}5r}xk8O5AN@7?xz3OJn0|#;SJ&XDe=qf2h{DMaf8)xeLa{4r0YNH|@@y zVvFa_UQbZ zJ(eB%%8zG`CfhirX?hJNoh#?oM<2D*WtvR0g&Go_M>qv=m86?Fip<3KcdOTkZIF5T^_mhRlboceUF;on5z|(DLAbaz#)B@ zxdk17;qa>NaVfXUtj9_>y|J`V4hveDa`cAGm4gGT(8$VRNG@KEWMkFR+3CA^1jC+;5UYWTm$oF#_yL@Vh5lxk( zMl0!$93@sa`iR#ja>$kOuT6ek^N4cxN}ZS;OF(^MqYr!MSC|I}p8)r8S;#Ce ztsiXCddp|7Wikmy{awp9`}}6oup5(j$)>KamJ6!C@8?lG*5 zsw#0|1ZfHF$-5s`xBseS*1Rb${_59iNvB^Xeb6_356)-_CFVWP?c=JlBKt&$NhJ4K z&j#L}c(vrUL5yd9bA>zOE$sEZb|OD$E)GVT0q66$dLY(wUn)8|5=*q!?oebj=s0Y`Jrh9WzaJP%WL4||+?e^%U>ffSXVAU@}^A=Aq3 zaf>kO!EYM^HE|-s4*IlVA^$Id7klZ8w>U)5YAy?zgA#VbJ`{rVR$Bp|*|h9{5_(16 zUgFp85Z`LjYsNhIL@w6jJj6N|a%qHU)6J!$lJsrayku+JfGwgEP6l*#TCwEJ+J)xb zC8eCWy^q>oR7D@p{P`u){dpY))g62A$K6w8r2(UuhLzek6hihP8U%&}t$yG-LdHPi798 z$7<4YF;o5Aoqh$Dgoi1WE84!vP?Wf;hKMoWGcr`>q4#iIUkTVNZENl|0H?6<0!JlP zXnLsaN~?v}+u7Qa$77!4@I}$%BrO^IcB3EjIr*@y3!Ta&RYDdM#v@bM^-)obgacGg zcGBbV5Pm0O9PM(JMgffAs)S++`q|_p`=?^zC7k&zBkd#NVbI(}PWf@?cu2~n(LZF* zF@mx3qz&>ViNRWJp82H9WpDdeFi~e)$he^-j(ws2c)czw9^%81r|UzBn+uOd+P4f{ zeVxh4jum&gIyzBMK99|?Ut``|vG=W|li_OtV|J5w>!BEri*HsZ3$rPE`6+nS= zP&MH#>8Yul5)a*y{nt=wdNIW}cr(SGa6M)xusgRAPvU8$aD!d3J;&_unI=7Azv-#@ z!SP8n6_1yusgQ`XbX*uOUVa5kE^ShESpm$~htX(5;*G7p@2M**tc;GPaIeDVQjO`e zPA6CBu9${V+DCba6-_6^kLk_Rwk2uJ(*$$sR9N*hS}*EaOVHsQMktJ(VPFj0C z>!z{&PO{^x?VChV#5-oLkv|Xiavum&rPJze8OCGGypSY_y--_pSUuZ z#}7Ma+DWpEm4&@hRdy~Pl$-B_5I%LMfk>uausNL z*^O_A(>BbU7oZ%L-pb;7O*%qTiQf$4+NF1+OLJCfofI8ac`SF+pVUD`GQqE1rQaq_ zIM3K6fAr2GH3DxF*KDvne`c`y9Bv3(yQuW1VApK--GD-s)OrY!iOp;aQTb#K>8--A z0OcL54)#w;2`}}U{EyjPt*P3fxVzN|KR5h#r&p9WZV2 zDkIiAUvbSanVPeb;TU?%HxG_WjR5Lu2HxG6MTuQiy`-0CU!{*96J^Sm+ioxr6fZ1! 
z$S^CNoGH0#l9g1a6k>9S6|Je2;!@}n;W(2r^hd{W7dBBvgGj}9WM@nVbL5Sq$Sb_u z0hYnE+qL)A=U9<(Pg}S~XDWkL(ZTps9&L&by}8IkmxF~HVc%jTY1U1x(?;J|+O9oW z@whUqHrbUB1u^)@e=*3`Z3!z6)e+xD&zvm@CdX8R^UC6|%xv!A2z zlQXrho3d(GiB}to#pdsD+y@4mN4n}A1vv-&$GCPub5n!a?dFq&U8#KR3she)sV!t| zOXd@~1c{37-T`{7Dq$oHFJNAC4cExreyFzR^?gTMI;B%2h9DI|52r3Ul^X_;=G47L z%`6YJEtcswp1$Ue0856%J@sc_&-95o5R{o%g|N8M_(dum+C%$7?diSZH72{# zOZJ5Z5ZMQLh@mjjZ)SYIy<6;yo*J`zKsIi>ZalT3n;Ey`SY?- z6?uF&M{V5o(LkEgJ$?eFqB8o4_^(HcpPFgw@49_0K{DI-B&G|Z#oj8+oBr1JNj$r` z8&Z@RU#C+WWx_o>Y2)?A%;+;Jw55;K=qnS8M7ErEl|@?+Bc-;Qek9wRp(_pY40?1I(YPA0DT`DGU#+C$ zVIf(-u5y=UrIq%Zt<&_re<>Bv;L*u3+W>VUvJJ7)zd%?XIzdl z$$~2gT1xZca@s`(ET%Zo`|ut!VMAl@^6SJXXO?e;4G}KHJ~gPl-D5`t(#x@rNo%Qo z5$j17zKE}-gKisbxSC@Z<(jWoOPH3Xp+cxGxN8}HBPd-b21%JM8`P3a_KDq*GW&AU zHCCHot0Kmbs(g-eu@ROaOQ`ATGyRxAc}!h4h}<)7ZhN5Wr8-7v;HIf4TZqnK+>4@f z^8F?q${1lt@+Sp5tys^F^Q#>?mPOXt-I80J=Mxw#9|GHkETDS1NBcVSagkgZ3;S4a zg=A{nd$fw%yllfW!q_Y)ZXTPZm%~N)_cdwYGLCV->LaBmQfa2uO1n(H zX0cTP*NJ`Ytz2F!5M_dmQW=$$(Dg}`2k)*gCfe+1!;s#58xM@oRoj4?!EC-u#6+no zb1zBr?Dp!h+|G<`Y4{p*(G8)Q#0Mj?KJwe<&^+$`bw#zFAEp11cE2YIT7O*cu_xvf z9>0Gos{6D^kwKIkAWwCIscKhX-1Wb~`|U6O%~V4ZokDDqdgQ{Awl2R!V=-)1GaQwE ztW z%X2};PWr)$f0UQM*Fyn&@9Oep#C#t5Ia*j-REJV7_Q+ix?>>W|^HO~VljfMFMj;{RLz6&kQzr*CZ7Dt(fz_EHISF8E%M-TC9)ioA4_d>{yVZ=oXai7k&U0Q9b9 zpTA}16TtNQ3~WEy?Z%*e63|s30)H;*FqHY?E4N8@9c~jV#w-RZQ6)V@g1v-&Q3g`s z+HCQkM*HvO3uxqv(rimpA4g75u1ZitIv<=N%wGgSso1&WIams~ms2x-1C%@h<^u56 zZaaeMPhnYA0|&-erqyHLgK;8nO>=|G@0{`$67>3Jd`# zv{-^otm+i#t>-~Q^s1ihDeEu<5B21|&ancQUpuUCWr(jwq%g!4P36kOdHB%H(`oR3 zD(+`zWo_n<+p^gR4oi)Dpaoa#gj~-Fjv3jN7Ml}iVP7`=qR#d6GZ=P6SXK2cpbNB;yH~d$vW&h&>|7i4o?{eXUCr~Q+;F8h{GyM$5A#gc= z_Nm^LjcxM>q!+*eq0T9vf31)~LNK`yAV$gDp4@qOckEY0ii0m$4Y96Ig{|DWNL40q zlSC%~J$vQp*cIGEJMRzc6X?EmzaA|mdw_M9jpp1e81}vyew96r_{&4lq4ZDr#|x(r z{TEt`YFh3?2$euA0~_%-Bq`IWlq+B_jkse&d^JW)PpFv8yd_8+GKQjpWxzzF>O@1w zvl5@_v+#|->*2@Kc($~|*mJz{mQ;du6+Bdj!;P*c^*pYI`*L9Dl}?+B$;BzLTnVe4uY&LREPcz*}N8Uz8m0Ll~- zBQWTgtnT+HpSLUTnCN_>j!I}0m$<$Hg$~be%c)bRz(;$=&&DQY?54CQVM6t;3|*(B 
zyV!M7aU875ZTSWRff9jlGXp&Ht=1Elhcjy9|uTmRLt>m0S54uzVM9k>;4 z{8N@UM%SbA&m%}tEBr;m&Ze6XA-=hA&f=%TSu+#eAVQsKg*^Q2izKD-M90Yc(5Cgq z6(HUS7?K60iI!QYDKbM4yR6n_0?O9>G_^Z)6E~6qY!vi(Al2)~8 z{ULAF_t)7A;AnK;wz}2?WBt&T{ux}^i(x5nnZ{$W$Ho4{{|V<$S&>EA+e=j@hGO1V zLF_ZUWGa9QJqMPFr`9&3&k!S|Aiy3Jq3sgp<;A31td)$>@Z zdMd5%NZHoVFUUNH;830P{}z`s__=x?yfe`t9r+}eQ36xn|PzwZ?ejLoOs*E(PtV;g}oDh zZWQ!>B3JQwPyO)%A-~Ux6<=z%ToT-14f~R=S!7jemW*Hj5#WI%>qL7 zvYd?aFAIQ4PEmMxZPj;(82gZBl&%rN*Y-7Qqmchg%AuwY<|XdEJ-N#DNI(sYGsea6 zMhEeEa&UH*PQFE2Gz_$ECxObPTCOQ)MUDrf%lY?dD2h6gPg5_~-{hHHS3`KO{ zHy`9o`WSVL^igV!xP|U;qus%)uuen1NU)`_-pkNVw=OO*H_tr(4i`Fnj7_%5=Ym@! z^aRAGh=~4h>HPg`kQSgrFSe<7K~0V{xj@fGd_g1{xFrz(thmxoGAJKMh`MS+U5GWb z*4I$5oYbkRbV(XeuGTp~(P{D=9K&~j$pMw@0mT%e7r1C4>OS_GUM1eZz^g=8my{^Z zOYU((V4wN9RPuSNiqSm1;AzggnK#-`#9PsU0|#Zxw-Lu^CSnPA8D27fAaR5A4mq3Q zcKqG4LB0woXwQ|oPp;|3iQ{k%*`#^zNL)Tp#6t>6SHU!^Xm3`)spPREE7fUbV)B@cnM)Xkv>OeOkT@&<-M)ae6&v0Z!Gr^}4Jcy!$S+KLP1?P*i~22_u($P| znDH8(G+awYww>)buB@YIpQo`jt=+_*E=4=`GnM$UHmtG4c5vC;9~be&y0=ekvPv zB889e={yu|S*KiIX4D9lgodX0+p(`hpD#Z;t((Kt=s#;(PB%GdKKmesqZ%BWH&jUs9jb;;CwcdW zge^1|Q;G~*D8A0KqpXXyvF(^?`dy12Wv6Qtho&S-y7Dc4SJ2y=uY7%$D)Vxlb|bfL z5BdV;Yrj@U_p`7+^pTDyAI9gOT~?9MQFs$XSV3&@BL91D{MClD3+6_xPN&IK{4mY* zLr?-#(z+sH%`>_`-Ns0i23, 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "00:15:20 INFO - pipeline id pipeline_id\n", - "00:15:20 INFO - code location None\n", - "00:15:20 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_text_out\n", - "00:15:20 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:15:20 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "00:15:20 INFO - orchestrator pdf2parquet started at 2025-01-21 00:15:20\n", - "00:15:20 INFO - Number of files is 6, source profile {'max_file_size': 
0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "00:15:20 INFO - Initializing models\n", - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 129720.74it/s]\n", - "00:15:24 INFO - Completed 1 files (16.67%) in 0.017 min\n", - "00:15:25 INFO - Completed 2 files (33.33%) in 0.032 min\n", - "00:15:26 INFO - Completed 3 files (50.0%) in 0.044 min\n", - "00:15:27 INFO - Completed 4 files (66.67%) in 0.054 min\n", - "00:15:27 INFO - Completed 5 files (83.33%) in 0.064 min\n", - "00:15:28 INFO - Completed 6 files (100.0%) in 0.075 min\n", - "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", - "00:15:28 INFO - done flushing in 0.0 sec\n", - "00:15:28 INFO - Completed execution in 0.127 min, execution result 0\n" + "23:43:57 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "23:43:57 INFO - pipeline id pipeline_id\n", + "23:43:57 INFO - code location None\n", + "23:43:57 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_text_out\n", + "23:43:57 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:43:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "23:43:57 INFO - orchestrator pdf2parquet started at 2025-01-21 23:43:57\n", + "23:43:57 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "23:43:57 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 112347.43it/s]\n", + "23:44:01 INFO - Completed 1 files (16.67%) in 0.017 min\n", + "23:44:02 INFO - Completed 2 files (33.33%) in 0.031 
min\n", + "23:44:03 INFO - Completed 3 files (50.0%) in 0.042 min\n", + "23:44:04 INFO - Completed 4 files (66.67%) in 0.052 min\n", + "23:44:04 INFO - Completed 5 files (83.33%) in 0.063 min\n", + "23:44:05 INFO - Completed 6 files (100.0%) in 0.073 min\n", + "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:44:05 INFO - done flushing in 0.0 sec\n", + "23:44:05 INFO - Completed execution in 0.126 min, execution result 0\n" ] }, { @@ -417,7 +453,7 @@ "output_type": "stream", "text": [ "✅ Stage:1 completed successfully\n", - "CPU times: user 21 s, sys: 1.65 s, total: 22.7 s\n", + "CPU times: user 20.3 s, sys: 1.99 s, total: 22.3 s\n", "Wall time: 10.3 s\n" ] } @@ -429,9 +465,9 @@ "from dpk_pdf2parquet.transform import pdf2parquet_contents_types\n", "\n", "STAGE = 1 \n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{MY_CONFIG.INPUT_DATA_DIR}' --> output='{output_text_dir}'\\n\", flush=True)\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_dir}' --> output='{output_text_dir}'\\n\", flush=True)\n", "\n", - "result = Pdf2Parquet(input_folder= MY_CONFIG.INPUT_DATA_DIR,\n", + "result = Pdf2Parquet(input_folder= input_dir,\n", " output_folder= output_text_dir,\n", " data_files_to_use=['.pdf'],\n", " pdf2parquet_contents_type=pdf2parquet_contents_types.MARKDOWN, # markdown\n", @@ -458,7 +494,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "fe59563d", "metadata": { "colab": { @@ -515,126 +551,126 @@ " \n", " \n", " 0\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", + " 1\n", + " 0\n", + " 2\n", + " ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b\n", + " 6571294142213095721\n", + " pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " 2025-01-21T23:44:04.067075\n", + " 0.636751\n", + " lorem-ipsum.pdf\n", + " \n", + " \n", + " 1\n", " spam.pdf\n", " Free xxx\n", " 1\n", " 0\n", " 2\n", - " 5a9d562d-ba87-4b2f-954d-fd9e7aece509\n", + " 
518a2e39-5c85-400f-8864-6bbc3ef20b1e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-21T00:15:28.419184\n", - " 0.660825\n", + " 2025-01-21T23:44:05.320766\n", + " 0.619056\n", " spam.pdf\n", " \n", " \n", - " 1\n", + " 2\n", " earth2.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 6fdf34dd-3e36-4311-9d85-eaa2c22146dd\n", + " c9accf02-d2ed-4307-b0c4-53a3a3699179\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-21T00:15:26.512279\n", - " 0.691329\n", + " 2025-01-21T23:44:03.428640\n", + " 0.620741\n", " earth2.pdf\n", " \n", " \n", - " 2\n", + " 3\n", " mars.pdf\n", " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", - " 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18\n", + " 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-21T00:15:27.756719\n", - " 0.615258\n", + " 2025-01-21T23:44:04.700038\n", + " 0.629441\n", " mars.pdf\n", " \n", " \n", - " 3\n", + " 4\n", " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6\n", + " b895a249-e72d-4096-85fa-e0606d61aebf\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-21T00:15:24.915376\n", - " 1.019706\n", + " 2025-01-21T23:44:01.917104\n", + " 0.993879\n", " earth-copy.pdf\n", " \n", - " \n", - " 4\n", - " lorem.pdf\n", - " Lorem ipsum Lorem ipsum Lorem ipsum\n", - " 1\n", - " 0\n", - " 2\n", - " 8a86db6e-0ac6-480c-9a86-d4ac383e2589\n", - " 6571294142213095721\n", - " pdf\n", - " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 35\n", - " 2025-01-21T00:15:27.139779\n", - " 0.625818\n", - " lorem.pdf\n", - " \n", " \n", "\n", "" ], "text/plain": [ 
- " filename contents \\\n", - "0 spam.pdf Free xxx \n", - "1 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "4 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", - "1 1 0 11 \n", + "1 1 0 2 \n", "2 1 0 11 \n", "3 1 0 11 \n", - "4 1 0 2 \n", + "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", - "1 6fdf34dd-3e36-4311-9d85-eaa2c22146dd 10729312978404042321 pdf \n", - "2 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", - "3 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", - "4 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", + "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", + "2 c9accf02-d2ed-4307-b0c4-53a3a3699179 10729312978404042321 pdf \n", + "3 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", + "4 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", "\n", " hash size \\\n", - "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", - "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", - "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
35 \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", - " date_acquired pdf_convert_time source_filename \n", - "0 2025-01-21T00:15:28.419184 0.660825 spam.pdf \n", - "1 2025-01-21T00:15:26.512279 0.691329 earth2.pdf \n", - "2 2025-01-21T00:15:27.756719 0.615258 mars.pdf \n", - "3 2025-01-21T00:15:24.915376 1.019706 earth-copy.pdf \n", - "4 2025-01-21T00:15:27.139779 0.625818 lorem.pdf " + " date_acquired pdf_convert_time source_filename \n", + "0 2025-01-21T23:44:04.067075 0.636751 lorem-ipsum.pdf \n", + "1 2025-01-21T23:44:05.320766 0.619056 spam.pdf \n", + "2 2025-01-21T23:44:03.428640 0.620741 earth2.pdf \n", + "3 2025-01-21T23:44:04.700038 0.629441 mars.pdf \n", + "4 2025-01-21T23:44:01.917104 0.993879 earth-copy.pdf " ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -675,7 +711,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "f870e624", "metadata": { "colab": { @@ -689,7 +725,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Free xxx\n" + "Lorem ipsum Lorem ipsum Lorem ipsum\n" ] } ], @@ -699,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "e1a10c2d", "metadata": { "colab": { @@ -713,23 +749,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "## Earth\n", - "\n", - "## Solar System\n", - "\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", - "\n", - "For more details about the Solar system see Chapter 1.\n", - "\n", - "## Earth\n", - "\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "\n", - "Basic facts about Earth:\n", - "\n", - "- · Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "- · Moons: One moon, called Luna or simply \"the Moon\".\n", - "- · Rotation Period: 24 hours (one day)\n" + "Free xxx\n" ] } ], @@ -764,7 +784,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "cee20521", "metadata": {}, "outputs": [ @@ -780,23 +800,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "00:15:28 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "00:15:28 INFO - pipeline id pipeline_id\n", - "00:15:28 INFO - code location None\n", - "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/01_text_out output_folder - output/02_docid_out\n", - "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - orchestrator doc_id started at 2025-01-21 00:15:28\n", - "00:15:28 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.049612998962402344}\n", - "00:15:28 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "00:15:28 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "00:15:28 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "00:15:28 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "00:15:28 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "00:15:28 INFO - Completed 6 files (100.0%) in 0.0 min\n", - 
"00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", - "00:15:28 INFO - done flushing in 0.0 sec\n", - "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n" + "23:44:05 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "23:44:05 INFO - pipeline id pipeline_id\n", + "23:44:05 INFO - code location None\n", + "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/01_text_out output_folder - output/02_docid_out\n", + "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - orchestrator doc_id started at 2025-01-21 23:44:05\n", + "23:44:05 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "23:44:05 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:44:05 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:44:05 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:44:05 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:44:05 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:44:05 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:44:05 INFO - done flushing in 0.0 sec\n", + "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -804,8 +824,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 20.5 ms, sys: 2.75 ms, total: 23.2 ms\n", - "Wall time: 18.6 ms\n" + "CPU times: user 15 ms, sys: 8.25 ms, total: 23.3 ms\n", + "Wall time: 18.9 ms\n" ] } ], @@ -844,7 +864,7 @@ }, { "cell_type": "code", - "execution_count": 13, + 
"execution_count": 12, "id": "f3d4aba9", "metadata": {}, "outputs": [ @@ -896,143 +916,143 @@ " \n", " \n", " 0\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", + " 1\n", + " 0\n", + " 2\n", + " ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b\n", + " 6571294142213095721\n", + " pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " 2025-01-21T23:44:04.067075\n", + " 0.636751\n", + " lorem-ipsum.pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 3\n", + " \n", + " \n", + " 1\n", " spam.pdf\n", " Free xxx\n", " 1\n", " 0\n", " 2\n", - " 5a9d562d-ba87-4b2f-954d-fd9e7aece509\n", + " 518a2e39-5c85-400f-8864-6bbc3ef20b1e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-21T00:15:28.419184\n", - " 0.660825\n", + " 2025-01-21T23:44:05.320766\n", + " 0.619056\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", " \n", " \n", - " 1\n", + " 2\n", " earth2.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 6fdf34dd-3e36-4311-9d85-eaa2c22146dd\n", + " c9accf02-d2ed-4307-b0c4-53a3a3699179\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-21T00:15:26.512279\n", - " 0.691329\n", + " 2025-01-21T23:44:03.428640\n", + " 0.620741\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", " \n", " \n", - " 2\n", + " 3\n", " mars.pdf\n", " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", - " 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18\n", + " 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-21T00:15:27.756719\n", - " 0.615258\n", + " 2025-01-21T23:44:04.700038\n", + " 0.629441\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", " \n", " 
\n", - " 3\n", + " 4\n", " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6\n", + " b895a249-e72d-4096-85fa-e0606d61aebf\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-21T00:15:24.915376\n", - " 1.019706\n", + " 2025-01-21T23:44:01.917104\n", + " 0.993879\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", " \n", - " \n", - " 4\n", - " lorem.pdf\n", - " Lorem ipsum Lorem ipsum Lorem ipsum\n", - " 1\n", - " 0\n", - " 2\n", - " 8a86db6e-0ac6-480c-9a86-d4ac383e2589\n", - " 6571294142213095721\n", - " pdf\n", - " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 35\n", - " 2025-01-21T00:15:27.139779\n", - " 0.625818\n", - " lorem.pdf\n", - " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 3\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " filename contents \\\n", - "0 spam.pdf Free xxx \n", - "1 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "4 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", - "1 1 0 11 \n", + "1 1 0 2 \n", "2 1 0 11 \n", "3 1 0 11 \n", - "4 1 0 2 \n", + "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", - "1 6fdf34dd-3e36-4311-9d85-eaa2c22146dd 10729312978404042321 pdf \n", - "2 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", - "3 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", - "4 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", + "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", + "2 c9accf02-d2ed-4307-b0c4-53a3a3699179 10729312978404042321 pdf \n", + "3 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", + "4 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", "\n", " hash size \\\n", - "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", - "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", - "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-21T00:15:28.419184 0.660825 spam.pdf \n", - "1 2025-01-21T00:15:26.512279 0.691329 earth2.pdf \n", - "2 2025-01-21T00:15:27.756719 0.615258 mars.pdf \n", - "3 2025-01-21T00:15:24.915376 1.019706 earth-copy.pdf \n", - "4 2025-01-21T00:15:27.139779 0.625818 lorem.pdf \n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-21T23:44:04.067075 0.636751 lorem-ipsum.pdf \n", + "1 2025-01-21T23:44:05.320766 0.619056 spam.pdf \n", + "2 2025-01-21T23:44:03.428640 0.620741 earth2.pdf \n", + "3 2025-01-21T23:44:04.700038 0.629441 mars.pdf \n", + "4 2025-01-21T23:44:01.917104 0.993879 earth-copy.pdf \n", "\n", " doc_hash int_id_column \n", - "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", - "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 \n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 \n", - "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 " + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 " ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1067,7 +1087,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "90eddb4c", "metadata": {}, "outputs": [ @@ -1083,24 +1103,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "00:15:28 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", - "00:15:28 INFO - pipeline id pipeline_id\n", - "00:15:28 INFO - code location None\n", - "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - orchestrator ededup started at 2025-01-21 00:15:28\n", - "00:15:28 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.05621051788330078}\n", - "00:15:28 INFO - Starting from the beginning\n", - "00:15:28 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "00:15:28 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "00:15:28 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "00:15:28 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "00:15:28 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "00:15:28 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", - "00:15:28 INFO - done flushing in 0.0 sec\n", - "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n" + "23:44:05 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "23:44:05 
INFO - pipeline id pipeline_id\n", + "23:44:05 INFO - code location None\n", + "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - orchestrator ededup started at 2025-01-21 23:44:05\n", + "23:44:05 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "23:44:05 INFO - Starting from the beginning\n", + "23:44:05 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:44:05 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:44:05 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:44:05 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:44:05 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:44:05 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:44:05 INFO - done flushing in 0.0 sec\n", + "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -1108,8 +1128,8 @@ "output_type": "stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 25.1 ms, sys: 2.26 ms, total: 27.4 ms\n", - "Wall time: 22 ms\n" + "CPU times: user 20.5 ms, sys: 5.55 ms, total: 26.1 ms\n", + "Wall time: 20.3 ms\n" ] } ], @@ -1145,7 +1165,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "1887b26d", "metadata": {}, "outputs": [ @@ -1201,148 +1221,148 @@ " \n", " \n", " 0\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", + " 1\n", + " 0\n", + " 2\n", + " ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b\n", + " 6571294142213095721\n", + " pdf\n", + " 
bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " 2025-01-21T23:44:04.067075\n", + " 0.636751\n", + " lorem-ipsum.pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 3\n", + " []\n", + " \n", + " \n", + " 1\n", " spam.pdf\n", " Free xxx\n", " 1\n", " 0\n", " 2\n", - " 5a9d562d-ba87-4b2f-954d-fd9e7aece509\n", + " 518a2e39-5c85-400f-8864-6bbc3ef20b1e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-21T00:15:28.419184\n", - " 0.660825\n", + " 2025-01-21T23:44:05.320766\n", + " 0.619056\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", " []\n", " \n", " \n", - " 1\n", + " 2\n", " earth2.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 6fdf34dd-3e36-4311-9d85-eaa2c22146dd\n", + " c9accf02-d2ed-4307-b0c4-53a3a3699179\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-21T00:15:26.512279\n", - " 0.691329\n", + " 2025-01-21T23:44:03.428640\n", + " 0.620741\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", " []\n", " \n", " \n", - " 2\n", + " 3\n", " mars.pdf\n", " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", - " 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18\n", + " 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-21T00:15:27.756719\n", - " 0.615258\n", + " 2025-01-21T23:44:04.700038\n", + " 0.629441\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", " []\n", " \n", " \n", - " 3\n", + " 4\n", " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6\n", + " b895a249-e72d-4096-85fa-e0606d61aebf\n", " 14711865278795535908\n", " pdf\n", " 
6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-21T00:15:24.915376\n", - " 1.019706\n", + " 2025-01-21T23:44:01.917104\n", + " 0.993879\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", " []\n", " \n", - " \n", - " 4\n", - " lorem.pdf\n", - " Lorem ipsum Lorem ipsum Lorem ipsum\n", - " 1\n", - " 0\n", - " 2\n", - " 8a86db6e-0ac6-480c-9a86-d4ac383e2589\n", - " 6571294142213095721\n", - " pdf\n", - " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 35\n", - " 2025-01-21T00:15:27.139779\n", - " 0.625818\n", - " lorem.pdf\n", - " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 3\n", - " []\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " filename contents \\\n", - "0 spam.pdf Free xxx \n", - "1 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "4 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", - "1 1 0 11 \n", + "1 1 0 2 \n", "2 1 0 11 \n", "3 1 0 11 \n", - "4 1 0 2 \n", + "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", - "1 6fdf34dd-3e36-4311-9d85-eaa2c22146dd 10729312978404042321 pdf \n", - "2 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", - "3 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", - "4 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", + "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", + "2 c9accf02-d2ed-4307-b0c4-53a3a3699179 10729312978404042321 pdf \n", + "3 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", + "4 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", "\n", " hash size \\\n", - "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", - "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", - "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-21T00:15:28.419184 0.660825 spam.pdf \n", - "1 2025-01-21T00:15:26.512279 0.691329 earth2.pdf \n", - "2 2025-01-21T00:15:27.756719 0.615258 mars.pdf \n", - "3 2025-01-21T00:15:24.915376 1.019706 earth-copy.pdf \n", - "4 2025-01-21T00:15:27.139779 0.625818 lorem.pdf \n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-21T23:44:04.067075 0.636751 lorem-ipsum.pdf \n", + "1 2025-01-21T23:44:05.320766 0.619056 spam.pdf \n", + "2 2025-01-21T23:44:03.428640 0.620741 earth2.pdf \n", + "3 2025-01-21T23:44:04.700038 0.629441 mars.pdf \n", + "4 2025-01-21T23:44:01.917104 0.993879 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", - "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", - "1 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 [] \n", - "4 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] " + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 [] " ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1393,7 +1413,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "37430b60", "metadata": {}, "outputs": [ @@ -1409,109 +1429,109 @@ "name": "stderr", "output_type": "stream", "text": [ - "00:15:28 INFO - Starting SignatureCalculation step\n", - "00:15:28 INFO - Got parameters for SignatureCalculation\n", - "00:15:28 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "00:15:28 INFO - data factory scdata_ is using local configuration without input/output path\n", - "00:15:28 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - pipeline id pipeline_id\n", - "00:15:28 INFO - code location None\n", - "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - orchestrator minhash started at 2025-01-21 00:15:28\n", - "00:15:28 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05067157745361328}\n", - "00:15:28 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "00:15:28 WARNING - table is empty, skipping processing\n", - "00:15:28 INFO - 
Completed 2 files (33.33%) in 0.0 min\n", - "00:15:28 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "00:15:28 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "00:15:28 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "00:15:28 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", - "00:15:28 INFO - Starting flush()\n", - "00:15:28 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", - "00:15:28 INFO - done flushing in 0.02 sec\n", - "00:15:28 INFO - Completed execution in 0.001 min, execution result 0\n", - "00:15:28 INFO - SignatureCalculation completed successfully\n", - "00:15:28 INFO - Starting ClusterAnalysis step\n", - "00:15:28 INFO - Got parameters for ClusterAnalysis\n", - "00:15:28 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "00:15:28 INFO - pipeline id pipeline_id\n", - "00:15:28 INFO - code location None\n", - "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - orchestrator cluster started at 2025-01-21 00:15:28\n", - "00:15:28 INFO - Number of folders is 14\n", - "00:15:28 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "00:15:28 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "00:15:28 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "00:15:28 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "00:15:28 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "00:15:28 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "00:15:28 INFO - Completed 7 files (50.0%) in 0.0 min\n", - "00:15:28 INFO 
- Completed 8 files (57.14%) in 0.0 min\n", - "00:15:28 INFO - Completed 9 files (64.29%) in 0.0 min\n", - "00:15:28 INFO - Completed 10 files (71.43%) in 0.0 min\n", - "00:15:28 INFO - Completed 11 files (78.57%) in 0.0 min\n", - "00:15:28 INFO - Completed 12 files (85.71%) in 0.0 min\n", - "00:15:28 INFO - Completed 13 files (92.86%) in 0.0 min\n", - "00:15:28 INFO - Completed 14 files (100.0%) in 0.0 min\n", - "00:15:28 INFO - Done processing 14 files, waiting for flush() completion.\n", - "00:15:28 INFO - done flushing in 0.0 sec\n", - "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n", - "00:15:28 INFO - ClusterAnalysis completed successfully\n", - "00:15:28 INFO - Starting GetDuplicateList step\n", - "00:15:28 INFO - Got parameters for GetDuplicateList\n", - "00:15:28 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "00:15:28 INFO - pipeline id pipeline_id\n", - "00:15:28 INFO - code location None\n", - "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - orchestrator fdlist started at 2025-01-21 00:15:28\n", - "00:15:28 INFO - Number of folders is 1\n", - "00:15:28 INFO - Get Duplicate List for folder docs_to_remove\n", - "00:15:28 INFO - 1 documents marked as duplicates\n", - "00:15:28 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "00:15:28 INFO - Done processing 1 files, waiting for flush() completion.\n", - "00:15:28 INFO - done flushing in 0.0 sec\n", - "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n", - "00:15:28 
INFO - GetDuplicateList completed successfully\n", - "00:15:28 INFO - Starting DataCleaning step\n", - "00:15:28 INFO - Got parameters for DataCleaning\n", - "00:15:28 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "00:15:28 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "00:15:28 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - pipeline id pipeline_id\n", - "00:15:28 INFO - code location None\n", - "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - orchestrator fdclean started at 2025-01-21 00:15:28\n", - "00:15:28 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05067157745361328}\n", - "00:15:28 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "00:15:28 WARNING - table is empty, skipping processing\n", - "00:15:28 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "00:15:28 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "00:15:28 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "00:15:28 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "00:15:28 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "00:15:28 INFO - Done processing 6 files, waiting for flush() completion.\n", - "00:15:28 INFO 
- done flushing in 0.0 sec\n", - "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n", - "00:15:28 INFO - DataCleaning completed successfully\n" + "23:44:05 INFO - Starting SignatureCalculation step\n", + "23:44:05 INFO - Got parameters for SignatureCalculation\n", + "23:44:05 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "23:44:05 INFO - data factory scdata_ is using local configuration without input/output path\n", + "23:44:05 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - pipeline id pipeline_id\n", + "23:44:05 INFO - code location None\n", + "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - orchestrator minhash started at 2025-01-21 23:44:05\n", + "23:44:05 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "23:44:05 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:44:05 WARNING - table is empty, skipping processing\n", + "23:44:05 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:44:05 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:44:05 INFO - Completed 4 files (66.67%) in 0.0 min\n", + 
"23:44:05 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:44:05 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:44:05 INFO - Starting flush()\n", + "23:44:05 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", + "23:44:05 INFO - done flushing in 0.021 sec\n", + "23:44:05 INFO - Completed execution in 0.001 min, execution result 0\n", + "23:44:05 INFO - SignatureCalculation completed successfully\n", + "23:44:05 INFO - Starting ClusterAnalysis step\n", + "23:44:05 INFO - Got parameters for ClusterAnalysis\n", + "23:44:05 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "23:44:05 INFO - pipeline id pipeline_id\n", + "23:44:05 INFO - code location None\n", + "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - orchestrator cluster started at 2025-01-21 23:44:05\n", + "23:44:05 INFO - Number of folders is 14\n", + "23:44:05 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "23:44:05 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "23:44:05 INFO - Completed 3 files (21.43%) in 0.0 min\n", + "23:44:05 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "23:44:05 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "23:44:05 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "23:44:05 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "23:44:05 INFO - Completed 8 files (57.14%) in 0.0 min\n", + "23:44:05 INFO - Completed 9 files (64.29%) in 0.0 min\n", + "23:44:05 INFO - Completed 10 files (71.43%) in 0.0 
min\n", + "23:44:05 INFO - Completed 11 files (78.57%) in 0.0 min\n", + "23:44:05 INFO - Completed 12 files (85.71%) in 0.0 min\n", + "23:44:05 INFO - Completed 13 files (92.86%) in 0.0 min\n", + "23:44:05 INFO - Completed 14 files (100.0%) in 0.0 min\n", + "23:44:05 INFO - Done processing 14 files, waiting for flush() completion.\n", + "23:44:05 INFO - done flushing in 0.0 sec\n", + "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n", + "23:44:05 INFO - ClusterAnalysis completed successfully\n", + "23:44:05 INFO - Starting GetDuplicateList step\n", + "23:44:05 INFO - Got parameters for GetDuplicateList\n", + "23:44:05 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "23:44:05 INFO - pipeline id pipeline_id\n", + "23:44:05 INFO - code location None\n", + "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - orchestrator fdlist started at 2025-01-21 23:44:05\n", + "23:44:05 INFO - Number of folders is 1\n", + "23:44:05 INFO - Get Duplicate List for folder docs_to_remove\n", + "23:44:05 INFO - 1 documents marked as duplicates\n", + "23:44:05 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "23:44:05 INFO - Done processing 1 files, waiting for flush() completion.\n", + "23:44:05 INFO - done flushing in 0.0 sec\n", + "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n", + "23:44:05 INFO - GetDuplicateList completed successfully\n", + "23:44:05 INFO - Starting DataCleaning step\n", + "23:44:05 INFO - Got parameters for DataCleaning\n", + 
"23:44:05 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "23:44:05 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "23:44:05 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - pipeline id pipeline_id\n", + "23:44:05 INFO - code location None\n", + "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - orchestrator fdclean started at 2025-01-21 23:44:05\n", + "23:44:05 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "23:44:05 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:44:05 WARNING - table is empty, skipping processing\n", + "23:44:05 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:44:05 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:44:05 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:44:05 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:44:05 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:44:05 INFO - done flushing in 0.0 sec\n", + "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n", + "23:44:05 INFO - DataCleaning completed 
successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 301 ms, sys: 71.6 ms, total: 373 ms\n", - "Wall time: 265 ms\n" + "CPU times: user 311 ms, sys: 72.2 ms, total: 383 ms\n", + "Wall time: 271 ms\n" ] } ], @@ -1555,7 +1575,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "573faba2", "metadata": {}, "outputs": [ @@ -1611,123 +1631,123 @@ " \n", " \n", " 0\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", + " 1\n", + " 0\n", + " 2\n", + " ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b\n", + " 6571294142213095721\n", + " pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " 2025-01-21T23:44:04.067075\n", + " 0.636751\n", + " lorem-ipsum.pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 3\n", + " []\n", + " \n", + " \n", + " 1\n", " spam.pdf\n", " Free xxx\n", " 1\n", " 0\n", " 2\n", - " 5a9d562d-ba87-4b2f-954d-fd9e7aece509\n", + " 518a2e39-5c85-400f-8864-6bbc3ef20b1e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-21T00:15:28.419184\n", - " 0.660825\n", + " 2025-01-21T23:44:05.320766\n", + " 0.619056\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", " []\n", " \n", " \n", - " 1\n", + " 2\n", " mars.pdf\n", " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", - " 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18\n", + " 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-21T00:15:27.756719\n", - " 0.615258\n", + " 2025-01-21T23:44:04.700038\n", + " 0.629441\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", " []\n", " \n", " \n", - " 2\n", + " 3\n", " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 
27d9fd2f-d815-4937-bbbf-d4a10cbce4c6\n", + " b895a249-e72d-4096-85fa-e0606d61aebf\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-21T00:15:24.915376\n", - " 1.019706\n", + " 2025-01-21T23:44:01.917104\n", + " 0.993879\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", " []\n", " \n", - " \n", - " 3\n", - " lorem.pdf\n", - " Lorem ipsum Lorem ipsum Lorem ipsum\n", - " 1\n", - " 0\n", - " 2\n", - " 8a86db6e-0ac6-480c-9a86-d4ac383e2589\n", - " 6571294142213095721\n", - " pdf\n", - " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 35\n", - " 2025-01-21T00:15:27.139779\n", - " 0.625818\n", - " lorem.pdf\n", - " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 3\n", - " []\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " filename contents \\\n", - "0 spam.pdf Free xxx \n", - "1 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "2 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "3 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", - "1 1 0 11 \n", + "1 1 0 2 \n", "2 1 0 11 \n", - "3 1 0 2 \n", + "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", - "1 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", - "2 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", - "3 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", + "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", + "2 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", + "3 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", "\n", " hash size \\\n", - "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", - "1 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", - "2 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", - "3 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-21T00:15:28.419184 0.660825 spam.pdf \n", - "1 2025-01-21T00:15:27.756719 0.615258 mars.pdf \n", - "2 2025-01-21T00:15:24.915376 1.019706 earth-copy.pdf \n", - "3 2025-01-21T00:15:27.139779 0.625818 lorem.pdf \n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-21T23:44:04.067075 0.636751 lorem-ipsum.pdf \n", + "1 2025-01-21T23:44:05.320766 0.619056 spam.pdf \n", + "2 2025-01-21T23:44:04.700038 0.629441 mars.pdf \n", + "3 2025-01-21T23:44:01.917104 0.993879 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", - "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 
5 [] \n", - "1 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", - "2 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 [] \n", - "3 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] " + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 [] " ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1770,7 +1790,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "b485f598", "metadata": {}, "outputs": [ @@ -1786,27 +1806,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "00:15:28 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-2-pdf-processing/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "00:15:28 INFO - data factory docq_ is using local configuration without input/output path\n", - "00:15:28 INFO - data factory docq_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - pipeline id pipeline_id\n", - "00:15:28 INFO - code location None\n", - "00:15:28 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", - "00:15:28 INFO - data factory data_ max_files -1, n_sample -1\n", - "00:15:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "00:15:28 INFO - orchestrator docq started at 2025-01-21 00:15:28\n", - "00:15:28 INFO - 
Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.04009246826171875}\n", - "00:15:28 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-2-pdf-processing/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "00:15:28 INFO - Completed 1 files (20.0%) in 0.0 min\n", - "00:15:28 WARNING - table is empty, skipping processing\n", - "00:15:28 INFO - Completed 2 files (40.0%) in 0.0 min\n", - "00:15:28 INFO - Completed 3 files (60.0%) in 0.0 min\n", - "00:15:28 INFO - Completed 4 files (80.0%) in 0.0 min\n", - "00:15:28 INFO - Completed 5 files (100.0%) in 0.0 min\n", - "00:15:28 INFO - Done processing 5 files, waiting for flush() completion.\n", - "00:15:28 INFO - done flushing in 0.0 sec\n", - "00:15:28 INFO - Completed execution in 0.0 min, execution result 0\n" + "23:44:05 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-2-pdf-processing/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "23:44:05 INFO - data factory docq_ is using local configuration without input/output path\n", + "23:44:05 INFO - data factory docq_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:44:05 INFO - pipeline id pipeline_id\n", + "23:44:05 INFO - code location None\n", + "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint 
['.parquet']\n", + "23:44:05 INFO - orchestrator docq started at 2025-01-21 23:44:05\n", + "23:44:05 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", + "23:44:05 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-2-pdf-processing/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "23:44:05 INFO - Completed 1 files (20.0%) in 0.0 min\n", + "23:44:05 WARNING - table is empty, skipping processing\n", + "23:44:05 INFO - Completed 2 files (40.0%) in 0.0 min\n", + "23:44:05 INFO - Completed 3 files (60.0%) in 0.0 min\n", + "23:44:05 INFO - Completed 4 files (80.0%) in 0.0 min\n", + "23:44:05 INFO - Completed 5 files (100.0%) in 0.0 min\n", + "23:44:05 INFO - Done processing 5 files, waiting for flush() completion.\n", + "23:44:05 INFO - done flushing in 0.0 sec\n", + "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -1814,8 +1834,8 @@ "output_type": "stream", "text": [ "✅ Stage:5 completed successfully\n", - "CPU times: user 38.8 ms, sys: 2.53 ms, total: 41.3 ms\n", - "Wall time: 35.3 ms\n" + "CPU times: user 31.5 ms, sys: 2.81 ms, total: 34.3 ms\n", + "Wall time: 28.4 ms\n" ] } ], @@ -1856,7 +1876,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "1f3225f8", "metadata": {}, "outputs": [ @@ -1914,12 +1934,36 @@ " \n", " \n", " 0\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", + " 1\n", + " 0\n", + " 2\n", + " ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b\n", + " 6571294142213095721\n", + " pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " ...\n", + " 5.000000\n", + " 0.000000\n", + " 1\n", + " 0.085714\n", + " 0.0\n", + " False\n", + " 0.000000\n", + " 0.0\n", + " 1.000000\n", + " False\n", + " \n", + " \n", + " 1\n", " spam.pdf\n", " Free xxx\n", " 1\n", " 0\n", " 2\n", - " 
5a9d562d-ba87-4b2f-954d-fd9e7aece509\n", + " 518a2e39-5c85-400f-8864-6bbc3ef20b1e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", @@ -1937,13 +1981,13 @@ " False\n", " \n", " \n", - " 1\n", + " 2\n", " mars.pdf\n", " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", - " 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18\n", + " 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -1961,13 +2005,13 @@ " True\n", " \n", " \n", - " 2\n", + " 3\n", " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6\n", + " b895a249-e72d-4096-85fa-e0606d61aebf\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -1984,88 +2028,64 @@ " 0.880734\n", " True\n", " \n", - " \n", - " 3\n", - " lorem.pdf\n", - " Lorem ipsum Lorem ipsum Lorem ipsum\n", - " 1\n", - " 0\n", - " 2\n", - " 8a86db6e-0ac6-480c-9a86-d4ac383e2589\n", - " 6571294142213095721\n", - " pdf\n", - " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 35\n", - " ...\n", - " 5.000000\n", - " 0.000000\n", - " 1\n", - " 0.085714\n", - " 0.0\n", - " False\n", - " 0.000000\n", - " 0.0\n", - " 1.000000\n", - " False\n", - " \n", " \n", "\n", "

4 rows × 27 columns

\n", "" ], "text/plain": [ - " filename contents \\\n", - "0 spam.pdf Free xxx \n", - "1 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "2 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "3 lorem.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", - "1 1 0 11 \n", + "1 1 0 2 \n", "2 1 0 11 \n", - "3 1 0 2 \n", + "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5a9d562d-ba87-4b2f-954d-fd9e7aece509 10026122586747302274 pdf \n", - "1 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", - "2 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", - "3 8a86db6e-0ac6-480c-9a86-d4ac383e2589 6571294142213095721 pdf \n", + "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", + "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", + "2 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", + "3 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", - "0 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 ... \n", - "1 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", - "2 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", - "3 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 ... \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... 
\n", "\n", " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", - "0 3.500000 0.000000 1 \n", - "1 4.688000 0.032000 8 \n", - "2 4.541284 0.027523 9 \n", - "3 5.000000 0.000000 1 \n", + "0 5.000000 0.000000 1 \n", + "1 3.500000 0.000000 1 \n", + "2 4.688000 0.032000 8 \n", + "3 4.541284 0.027523 9 \n", "\n", " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", - "0 0.000000 0.0 True \n", - "1 0.000000 0.0 False \n", + "0 0.085714 0.0 False \n", + "1 0.000000 0.0 True \n", "2 0.000000 0.0 False \n", - "3 0.085714 0.0 False \n", + "3 0.000000 0.0 False \n", "\n", " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", "0 0.000000 0.0 \n", - "1 0.176471 0.0 \n", + "1 0.000000 0.0 \n", "2 0.176471 0.0 \n", - "3 0.000000 0.0 \n", + "3 0.176471 0.0 \n", "\n", " docq_alphabet_word_ratio docq_contain_common_en_words \n", "0 1.000000 False \n", - "1 0.880000 True \n", - "2 0.880734 True \n", - "3 1.000000 False \n", + "1 1.000000 False \n", + "2 0.880000 True \n", + "3 0.880734 True \n", "\n", "[4 rows x 27 columns]" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2092,7 +2112,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "id": "5dac1c70", "metadata": {}, "outputs": [ @@ -2142,13 +2162,13 @@ " \n", " \n", " \n", - " 1\n", + " 2\n", " mars.pdf\n", " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", - " 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18\n", + " 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2166,13 +2186,13 @@ " True\n", " \n", " \n", - " 2\n", + " 3\n", " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6\n", + " b895a249-e72d-4096-85fa-e0606d61aebf\n", " 14711865278795535908\n", " pdf\n", " 
6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2196,41 +2216,41 @@ ], "text/plain": [ " filename contents \\\n", - "1 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "2 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "\n", " num_pages num_tables num_doc_elements \\\n", - "1 1 0 11 \n", "2 1 0 11 \n", + "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "1 0dd165a1-6de2-4df0-ad7a-c6ad21da9c18 7758129997476962679 pdf \n", - "2 27d9fd2f-d815-4937-bbbf-d4a10cbce4c6 14711865278795535908 pdf \n", + "2 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", + "3 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", - "1 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", - "2 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... 
\n", "\n", " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", - "1 4.688000 0.032000 8 \n", - "2 4.541284 0.027523 9 \n", + "2 4.688000 0.032000 8 \n", + "3 4.541284 0.027523 9 \n", "\n", " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", - "1 0.0 0.0 False \n", "2 0.0 0.0 False \n", + "3 0.0 0.0 False \n", "\n", " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", - "1 0.176471 0.0 \n", "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", "\n", " docq_alphabet_word_ratio docq_contain_common_en_words \n", - "1 0.880000 True \n", - "2 0.880734 True \n", + "2 0.880000 True \n", + "3 0.880734 True \n", "\n", "[2 rows x 27 columns]" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2239,12 +2259,12 @@ "all_docs_df = read_parquet_files_as_df(output_doc_quality_dir)\n", "\n", "# remove documents with badwords\n", - "clean_docs = all_docs_df[all_docs_df['docq_contain_bad_word'] == False]\n", + "clean_docs_df = all_docs_df[all_docs_df['docq_contain_bad_word'] == False]\n", "\n", "# also filter out 'lorem ipsum' text\n", - "clean_docs = clean_docs[clean_docs['docq_lorem_ipsum_ratio'] == 0]\n", + "clean_docs_df = clean_docs_df[clean_docs_df['docq_lorem_ipsum_ratio'] == 0]\n", "\n", - "clean_docs.head()" + "clean_docs_df.head()" ] }, { @@ -2259,7 +2279,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", "metadata": { "colab": { @@ -2273,18 +2293,67 @@ "name": "stdout", "output_type": "stream", "text": [ - "✅ Saved output to 'output/output_final'\n" + "✅ Saved parquet output to 'output/output_final/pq'\n" ] } ], "source": [ "import shutil\n", "\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", - "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER_FINAL, exist_ok=True)\n", + "shutil.rmtree(output_final_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_final_dir, 
exist_ok=True)\n", + "\n", + "output_final_dir_parquet = os.path.join (output_final_dir, 'pq')\n", + "shutil.os.makedirs(output_final_dir_parquet, exist_ok=True)\n", "\n", - "clean_docs.to_parquet(os.path.join(MY_CONFIG.OUTPUT_FOLDER_FINAL, \"final.parquet\"))\n", - "print (f\"✅ Saved output to '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + "output_final_dir_markdown = os.path.join (output_final_dir, 'markdown')\n", + "shutil.os.makedirs(output_final_dir_markdown, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "e06ce4f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Saved CLEAN parquet output to 'output/output_final/pq'\n" + ] + } + ], + "source": [ + "## save parquet\n", + "\n", + "clean_docs_df.to_parquet(os.path.join(output_final_dir_parquet, \"clean_docs.parquet\"))\n", + "print (f\"✅ Saved CLEAN parquet output to '{output_final_dir_parquet}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1e175302", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Saved CLEAN markdown output to 'output/output_final/markdown'\n" + ] + } + ], + "source": [ + "## save markdown text\n", + "\n", + "for index, row in clean_docs_df.iterrows():\n", + " output_file_name = os.path.join (output_final_dir_markdown, row['filename'] + '.md')\n", + " with open(output_file_name, 'w') as output_file:\n", + " output_file.write(row['contents'])\n", + " \n", + "print (f\"✅ Saved CLEAN markdown output to '{output_final_dir_markdown}'\")\n" ] } ], From b1f6701f686202560bb37710f55829a34d7b3878 Mon Sep 17 00:00:00 2001 From: Sujee Maniyam Date: Wed, 22 Jan 2025 22:22:54 -0800 Subject: [PATCH 3/6] updated to run on Google colab Signed-off-by: Sujee Maniyam --- examples/notebooks/pdf-processing-1/README.md | 27 +- .../pdf_processing_1_python.ipynb | 5425 +++++++++-------- 2 files changed, 2799 insertions(+), 2653 deletions(-) diff --git 
a/examples/notebooks/pdf-processing-1/README.md b/examples/notebooks/pdf-processing-1/README.md index 84d8e15c6..dc63ecd34 100644 --- a/examples/notebooks/pdf-processing-1/README.md +++ b/examples/notebooks/pdf-processing-1/README.md @@ -1,8 +1,18 @@ # PDF Processing with Data Prep Kit -Show cases Data Prep Kit capabilities of processing PDFs +Show cases Data Prep Kit capabilities of processing PDFs. -## Running the code +We will demonstrate the following: + +- Extracting text from PDF files +- removing duplicates (exact and fuzzy matches) +- accessing document quality and removing documents containing spam words, placeholder content like 'lorem ipsum' ..etc. + +**Workflow** + +![](images/data-prep-kit-3-workflow.png) + +## Setting up Python Environment The code can be run on either @@ -14,7 +24,7 @@ conda create -n data-prep-kit -y python=3.11 conda activate data-prep-kit # install the following in 'data-prep-kit' environment -pip3 install 'data-prep-toolkit-transforms[ray,all]==1.0.0a4' +pip3 install 'data-prep-toolkit-transforms[ray,all]==1.0.0' pip3 install jupyterlab ipykernel ipywidgets ## install custom kernel @@ -25,19 +35,20 @@ python -m ipykernel install --user --name=data-prep-kit --display-name "dataprep jupyter lab ``` -## Intro +## Running the code -This notebook will demonstrate processing PDFs -`PDFs ---> text ---> compute hash ---> dedupe ---> document quality` +[python version](pdf_processing_1_python.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb) -[python version](dpk_intro_1_python.ipynb)   |   [ray version](dpk_intro_1_ray.ipynb) +[ray version](pdf_processing_1_ray.ipynb)   [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb) ## Creating Input PDFs (Optional) +Sample PDFs we use for this example are created from markdown documents using pandoc utility, as follows. + ```bash -cd input/solar-system +cd input pandoc earth.md -o earth.pdf pandoc earth2.md -o earth2.pdf diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb index f5c3edb95..30ba3316c 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb @@ -1,2730 +1,2865 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", - "metadata": { - "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" - }, - "source": [ - "# Processing PDFs using Data Prep Kit\n", - "\n", - "This notebook will introduce DPK and showcase some of it's capabilities.\n", - "\n", - "Here is the workflow:\n", - "\n", - "- pdf2parquet: Extract text from PDF documents\n", - "- docid: compute hashes\n", - "- exact dedupe : filter out identical documents\n", - "- fuzzy dedupe : filter out 'near duplicates'\n", - "- document quality: scoring documents for quality\n", - "\n", - "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" - ] - }, - { - "cell_type": "markdown", - "id": "b15976e3", - "metadata": { - "id": "b15976e3" - }, - "source": [ - "## How to run this notebook\n", - "\n", - "Two options:\n", - "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/dpk_intro_1_python.ipynb)\n", - "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", - "\n", - "The notebook will work as in both environments" - ] - }, - { - "cell_type": "markdown", - "id": "39a0ab6e", - "metadata": { - "id": "39a0ab6e" - }, - "source": [ - "## Step-1: Figure out Runtime Environment\n", - "\n", - "### 1.1 - Determine runtime\n", - "\n", - "Determine if we are running on Google colab or local python environment" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1fe354b7", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": { + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" + }, + "source": [ + "# Processing PDFs using Data Prep Kit\n", + "\n", + "This notebook will introduce DPK and showcase some of it's capabilities.\n", + "\n", + "Here is the workflow:\n", + "\n", + "- pdf2parquet: Extract text from PDF documents\n", + "- docid: compute hashes\n", + "- exact dedupe : filter out identical documents\n", + "- fuzzy dedupe : filter out 'near duplicates'\n", + "- document quality: scoring documents for quality\n", + "\n", + "![](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" + ] }, - "id": "1fe354b7", - "outputId": "5c153f72-08ed-4d6e-ccc7-dae851e7fd8b" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = 
False" - ] - }, - { - "cell_type": "markdown", - "id": "a5dc2b68", - "metadata": { - "id": "a5dc2b68" - }, - "source": [ - "### 1.2 - Install dependencies if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "1fcec577", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 + "cell_type": "markdown", + "id": "b15976e3", + "metadata": { + "id": "b15976e3" + }, + "source": [ + "## How to run this notebook\n", + "\n", + "Two options:\n", + "\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb)\n", + "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", + "\n", + "The notebook will work as in both environments" + ] }, - "id": "1fcec577", - "outputId": "0f77fc39-ffeb-48da-ce6f-1750d8d3ad62" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " ! 
pip install --default-timeout=100 \\\n", - " data-prep-toolkit-transforms[ray,all]==1.0.0a4" - ] - }, - { - "cell_type": "markdown", - "id": "243322b8", - "metadata": { - "id": "243322b8" - }, - "source": [ - "### 1.3 - Restart Runtime\n", - "\n", - "After installing dependencies, be sure restart runtime, so libraries will be loaded\n", - "\n", - "You do this by going to **`Runtime --> Restart Session`**\n", - "\n", - "Then you can continue to the next step (no need to re-run the notebook)" - ] - }, - { - "cell_type": "markdown", - "id": "e8b10be1", - "metadata": { - "id": "e8b10be1" - }, - "source": [ - "## Step-2: Configuration & Utils" - ] - }, - { - "cell_type": "markdown", - "id": "356c66f7", - "metadata": { - "id": "356c66f7" - }, - "source": [ - "### 2.1 - Basic Config" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e4YMZrBuFycl", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + { + "cell_type": "markdown", + "id": "39a0ab6e", + "metadata": { + "id": "39a0ab6e" + }, + "source": [ + "## Step-1: Figure out Runtime Environment\n", + "\n", + "### 1.1 - Determine runtime\n", + "\n", + "Determine if we are running on Google colab or local python environment" + ] }, - "id": "e4YMZrBuFycl", - "outputId": "d7ee9449-4f21-4c9a-fa54-14b7f28d764a" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT in Colab\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " print(\"Running in Colab\")\n", - " RUNNING_IN_COLAB = True\n", - "else:\n", - " print(\"NOT in Colab\")\n", - " RUNNING_IN_COLAB = False" - ] - }, - { - "cell_type": "markdown", - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", - "metadata": { - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" - }, - "source": [ - "### 2.2 - Setup input/outpur directories" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "metadata": { - "colab": { - 
"base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 1, + "id": "1fe354b7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1fe354b7", + "outputId": "39cc4e90-b230-4100-92c9-3aa3d977fa3d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] }, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "outputId": "4d5511fb-1c6f-47df-e5ea-2c1b354d262f" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Cleared output directory\n" - ] - } - ], - "source": [ - "import os, sys\n", - "import shutil\n", - "\n", - "input_dir = \"input\"\n", - "shutil.os.makedirs(input_dir, exist_ok=True)\n", - "output_dir = \"output\"\n", - "\n", - "output_text_dir = os.path.join (output_dir, '01_text_out')\n", - "output_docid_dir = os.path.join (output_dir, '02_docid_out')\n", - "output_exact_dedupe_dir = os.path.join (output_dir, '03_exact_dedupe_out')\n", - "output_fuzzy_dedupe_dir = os.path.join (output_dir, '04_fuzzy_dedupe_out')\n", - "output_doc_quality_dir = os.path.join (output_dir, '05_doc_quality_out')\n", - "output_final_dir = os.path.join (output_dir, 'output_final')\n", - "\n", - "## clear output folder\n", - "shutil.rmtree(output_dir, ignore_errors=True)\n", - "shutil.os.makedirs(output_dir, exist_ok=True)\n", - "print (\"✅ Cleared output directory\")" - ] - }, - { - "cell_type": "markdown", - "id": "14b2f34c", - "metadata": {}, - "source": [ - "### 2.3 - Handy Utils" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "ba47a370", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import requests\n", - "from humanfriendly import format_size\n", - "import pandas as 
pd\n", - "import glob\n", - "\n", - "## Reads parquet files in a folder into a pandas dataframe\n", - "def read_parquet_files_as_df (parquet_dir):\n", - " parquet_files = glob.glob(f'{parquet_dir}/*.parquet')\n", - " # read each parquet file into a DataFrame and store in a list\n", - " dfs = [pd.read_parquet (f) for f in parquet_files]\n", - " dfs = [df for df in dfs if not df.empty] # filter out empty dataframes\n", - " # Concatenate all DataFrames into a single DataFrame\n", - " if len(dfs) > 0:\n", - " data_df = pd.concat(dfs, ignore_index=True)\n", - " return data_df\n", - " else:\n", - " return pd.DataFrame() # return empty df\n", - "# ------------\n", - "\n", - "\n", - "def download_file(url, local_file, chunk_size=1024*1024):\n", - " \"\"\"\n", - " Downloads a remote URL to a local file.\n", - "\n", - " Args:\n", - " url (str): The remote URL.\n", - " local_filename (str): The name of the local file to save the downloaded content.\n", - " chunk_size (int): The size in bytes of each chunk. Defaults to 1024.\n", - "\n", - " Returns:\n", - " None\n", - " \n", - " Example usage:\n", - " download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB\n", - " \"\"\"\n", - " # Check if the local file already exists\n", - " if os.path.exists(local_file):\n", - " file_size = format_size(os.path.getsize(local_file))\n", - " print(f\"Local file '{local_file}' ({file_size}) already exists. 
Skipping download.\")\n", - " return\n", - "\n", - " # Create the directory if it doesn't exist\n", - " os.makedirs(os.path.dirname(local_file), exist_ok=True)\n", - "\n", - " # Stream the file download\n", - " with requests.get(url, stream=True) as r:\n", - " r.raise_for_status()\n", - " with open(local_file, 'wb') as f:\n", - " for chunk in r.iter_content(chunk_size=chunk_size):\n", - " if chunk: # filter out keep-alive new chunks\n", - " f.write(chunk)\n", - " print()\n", - " file_size = format_size(os.path.getsize(local_file))\n", - " print(f\"{local_file} ({file_size}) downloaded successfully.\")\n", - "## --- end: download_file ------\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "dc1972c3", - "metadata": {}, - "source": [ - "## Step-3: Inspect the Data\n", - "\n", - "We will use simple PDFs. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/pdf-processing-1/input/)\n", - "\n", - "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/earth.pdf) and exact duplicate [earth-copy.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/earth-copy.pdf)\n", - "- [earth2.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/earth2.pdf) almost similar to earth.pdf (ONE word difference!)\n", - "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/mars.pdf)\n", - "- [spam.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/spam.pdf) - contains spammy contents\n", - "- [lorem.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/pdf-processing-1/input/lorem.pdf) - contains 'lorem ipsum' placeholder\n" - ] - }, - { - "cell_type": "markdown", - "id": "7113b16c", - "metadata": {}, - "source": [ - "### 3.1 -Download Data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "23db1064", - 
"metadata": {}, - "outputs": [ + "cell_type": "markdown", + "id": "a5dc2b68", + "metadata": { + "id": "a5dc2b68" + }, + "source": [ + "### 1.2 - Install dependencies if running on Google Colab" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Local file 'input/earth.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/earth-copy.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/earth2.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/mars.pdf' (57.87 KB) already exists. Skipping download.\n", - "Local file 'input/spam.pdf' (24.87 KB) already exists. Skipping download.\n", - "Local file 'input/lorem-ipsum.pdf' (25.72 KB) already exists. Skipping download.\n" - ] - } - ], - "source": [ - "\n", - "download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", - "\n", - "download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", - "\n", - "download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", - "\n", - "download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", - "\n", - "download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", - "\n", - "download_file ('https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))" - ] - }, - { - "cell_type": "markdown", - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", - "metadata": { - "id": 
"2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" - }, - "source": [ - "## Step-4: Extract Data from PDF (pdf2parquet)\n", - "\n", - "This step we will read PDF files and extract the text data.\n", - "\n", - "[Pdf2Parquet documentation](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/README.md)\n", - "\n", - "We use the [Docling package](https://github.com/DS4SD/docling).\n" - ] - }, - { - "cell_type": "markdown", - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", - "metadata": { - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" - }, - "source": [ - "### 4.1 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 657, - "referenced_widgets": [ - "97b603697cfa4b4ea4e6735b6768ca35", - "e87e8d3262c54cfaaa8768505edacda3", - "b78aa40816e44f7fbebcb24ca68818b3", - "7053c9606a414e978636a7e241909504", - "da0787b239764847a731083997780a85", - "553f3c16839a49d79591d0fc4862bed6", - "c0eb5bc8f6ee427ca42204b3c56f9a4e", - "9d184ed175f0403fb03c2e13dfd04e0a", - "724778729161445c98b187031ae4f67c", - "1cb3bbf7d724411cbe9831543a4aecc0", - "06f9b33494984e4885d5aad813d1d2bc" - ] + "cell_type": "code", + "execution_count": 2, + "id": "1fcec577", + "metadata": { + "id": "1fcec577" + }, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "if RUNNING_IN_COLAB:\n", + " ! 
pip install --default-timeout=100 \\\n", + " data-prep-toolkit-transforms[all]==1.0.0 \\\n", + " humanfriendly" + ] }, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "outputId": "01d207fb-983d-40b2-e5f6-e38e3789110a" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_text_out'\n", - "\n" - ] + "cell_type": "markdown", + "id": "243322b8", + "metadata": { + "id": "243322b8" + }, + "source": [ + "### 1.3 - Restart Runtime\n", + "\n", + "After installing dependencies, be sure to restart the runtime so the libraries will be loaded.\n", + "\n", + "You do this by going to **`Runtime --> Restart Session`**\n", + "\n", + "Then you can continue to the next step (no need to re-run the notebook)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "23:43:57 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "23:43:57 INFO - pipeline id pipeline_id\n", - "23:43:57 INFO - code location None\n", - "23:43:57 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_text_out\n", - "23:43:57 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:43:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "23:43:57 INFO - orchestrator pdf2parquet started at 2025-01-21 23:43:57\n", - "23:43:57 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "23:43:57 INFO - Initializing models\n", - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 112347.43it/s]\n", - "23:44:01 INFO - Completed 1 files (16.67%) in 0.017 min\n", - "23:44:02 
INFO - Completed 2 files (33.33%) in 0.031 min\n", - "23:44:03 INFO - Completed 3 files (50.0%) in 0.042 min\n", - "23:44:04 INFO - Completed 4 files (66.67%) in 0.052 min\n", - "23:44:04 INFO - Completed 5 files (83.33%) in 0.063 min\n", - "23:44:05 INFO - Completed 6 files (100.0%) in 0.073 min\n", - "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:44:05 INFO - done flushing in 0.0 sec\n", - "23:44:05 INFO - Completed execution in 0.126 min, execution result 0\n" - ] + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": { + "id": "e8b10be1" + }, + "source": [ + "## Step-2: Configuration & Utils" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:1 completed successfully\n", - "CPU times: user 20.3 s, sys: 1.99 s, total: 22.3 s\n", - "Wall time: 10.3 s\n" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "from dpk_pdf2parquet.transform_python import Pdf2Parquet\n", - "from dpk_pdf2parquet.transform import pdf2parquet_contents_types\n", - "\n", - "STAGE = 1 \n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_dir}' --> output='{output_text_dir}'\\n\", flush=True)\n", - "\n", - "result = Pdf2Parquet(input_folder= input_dir,\n", - " output_folder= output_text_dir,\n", - " data_files_to_use=['.pdf'],\n", - " pdf2parquet_contents_type=pdf2parquet_contents_types.MARKDOWN, # markdown\n", - " # pdf2parquet_contents_type=pdf2parquet_contents_types.JSON # JSON\n", - " ).transform()\n", - "\n", - "if result == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (f\"❌ Stage:{STAGE} failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "5ca790e0", - "metadata": { - "id": "5ca790e0" - }, - "source": [ - "### 4.2 - Inspect Generated output\n", - "\n", - "Here we should see one entry per input file processed." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "fe59563d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 + "cell_type": "markdown", + "id": "356c66f7", + "metadata": { + "id": "356c66f7" + }, + "source": [ + "### 2.1 - Basic Config" + ] }, - "id": "fe59563d", - "outputId": "346e0584-bdde-4705-8c2a-f3c1582cd7e7" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Displaying contents of : output/01_text_out\n" - ] + "cell_type": "code", + "execution_count": 3, + "id": "e4YMZrBuFycl", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e4YMZrBuFycl", + "outputId": "ad7fc57a-5229-4841-8d8a-23272aa5197d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT in Colab\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " print(\"Running in Colab\")\n", + " RUNNING_IN_COLAB = True\n", + "else:\n", + " print(\"NOT in Colab\")\n", + " RUNNING_IN_COLAB = False" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": { + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" + }, + "source": [ + "### 2.2 - Setup input/output directories" + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filename
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-21T23:44:04.0670750.636751lorem-ipsum.pdf
1spam.pdfFree xxx102518a2e39-5c85-400f-8864-6bbc3ef20b1e10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-21T23:44:05.3207660.619056spam.pdf
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011c9accf02-d2ed-4307-b0c4-53a3a369917910729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-21T23:44:03.4286400.620741earth2.pdf
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101137f9901a-f0b3-49c5-b5cd-dbfeb0126cd47758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-21T23:44:04.7000380.629441mars.pdf
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011b895a249-e72d-4096-85fa-e0606d61aebf14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-21T23:44:01.9171040.993879earth-copy.pdf
\n", - "
" + "cell_type": "code", + "execution_count": 4, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "outputId": "63d1d197-dfb1-4d6f-eb88-846bbbff1446" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Cleared output directory\n" + ] + } ], - "text/plain": [ - " filename contents \\\n", - "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", - "1 spam.pdf Free xxx \n", - "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "\n", - " num_pages num_tables num_doc_elements \\\n", - "0 1 0 2 \n", - "1 1 0 2 \n", - "2 1 0 11 \n", - "3 1 0 11 \n", - "4 1 0 11 \n", - "\n", - " document_id document_hash ext \\\n", - "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", - "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", - "2 c9accf02-d2ed-4307-b0c4-53a3a3699179 10729312978404042321 pdf \n", - "3 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", - "4 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", - "\n", - " hash size \\\n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", - "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", - "\n", - " date_acquired pdf_convert_time source_filename \n", - "0 2025-01-21T23:44:04.067075 0.636751 lorem-ipsum.pdf \n", - "1 2025-01-21T23:44:05.320766 0.619056 spam.pdf \n", - "2 2025-01-21T23:44:03.428640 0.620741 earth2.pdf \n", - "3 2025-01-21T23:44:04.700038 0.629441 mars.pdf \n", - "4 2025-01-21T23:44:01.917104 0.993879 earth-copy.pdf " + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "input_dir = \"input\"\n", + "shutil.os.makedirs(input_dir, exist_ok=True)\n", + "output_dir = \"output\"\n", + "\n", + "output_text_dir = os.path.join (output_dir, '01_text_out')\n", + "output_docid_dir = os.path.join (output_dir, '02_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (output_dir, '03_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (output_dir, '04_fuzzy_dedupe_out')\n", + "output_doc_quality_dir = os.path.join (output_dir, '05_doc_quality_out')\n", + "output_final_dir = os.path.join (output_dir, 'output_final')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(output_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_dir, exist_ok=True)\n", + "print (\"✅ Cleared output directory\")" ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print (\"Displaying contents of : \", output_text_dir)\n", - "output_df = read_parquet_files_as_df(output_text_dir)\n", - "# print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", - "output_df.head()\n", - "\n", - "## To display certain columns\n", - "#parquet_df[['column1', 'column2', 'column3']].head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "e5058a21", - "metadata": { - "id": "e5058a21" - }, - "source": [ - "\n", - "### 4.3 - Understand the output\n", - "\n", - "Here are some interesting attributes to note:\n", - "\n", - "- **filename** : original filename\n", - "- **contents** : text\n", - "- **document_id**: unique id (UUID) assignd to this document\n", - "- 
**document_hash**: hash of documents\n", - "- **hash** : hash of `contents` column\n", - "- **pdf_convert_time** : time to convert this pdf in seconds\n", - "\n", - "**Note: you should notice the hash values are identical for the duplicate documents**\n", - "\n", - "Let's inspect the **contents** column." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "f870e624", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "f870e624", - "outputId": "0b4c054f-3a8a-4db3-f32f-17bd1466b102" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Lorem ipsum Lorem ipsum Lorem ipsum\n" - ] - } - ], - "source": [ - "print (output_df.iloc[0, ]['contents'])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "e1a10c2d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "markdown", + "id": "14b2f34c", + "metadata": { + "id": "14b2f34c" + }, + "source": [ + "### 2.3 - Handy Utils" + ] }, - "id": "e1a10c2d", - "outputId": "c1d992c2-faa8-40cd-c375-857970201daa" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Free xxx\n" - ] - } - ], - "source": [ - "print (output_df.iloc[1, ]['contents'])\n" - ] - }, - { - "cell_type": "markdown", - "id": "7fc86d5b", - "metadata": {}, - "source": [ - "## Step-5: Create DOC ID for Documents\n", - "\n", - "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", - "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. 
To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", - "\n", - "**This step is a pre-requisite for fuzzy dedup** in the pipeline.\n", - "\n", - "[DocID documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/doc_id)" - ] - }, - { - "cell_type": "markdown", - "id": "f516a253", - "metadata": {}, - "source": [ - "### 5.1 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "cee20521", - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 5, + "id": "ba47a370", + "metadata": { + "id": "ba47a370" + }, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from humanfriendly import format_size\n", + "import pandas as pd\n", + "import glob\n", + "\n", + "## Reads parquet files in a folder into a pandas dataframe\n", + "def read_parquet_files_as_df (parquet_dir):\n", + " parquet_files = glob.glob(f'{parquet_dir}/*.parquet')\n", + " # read each parquet file into a DataFrame and store in a list\n", + " dfs = [pd.read_parquet (f) for f in parquet_files]\n", + " dfs = [df for df in dfs if not df.empty] # filter out empty dataframes\n", + " # Concatenate all DataFrames into a single DataFrame\n", + " if len(dfs) > 0:\n", + " data_df = pd.concat(dfs, ignore_index=True)\n", + " return data_df\n", + " else:\n", + " return pd.DataFrame() # return empty df\n", + "# ------------\n", + "\n", + "\n", + "def download_file(url, local_file, chunk_size=1024*1024):\n", + " \"\"\"\n", + " Downloads a remote URL to a local file.\n", + "\n", + " Args:\n", + " url (str): The remote URL.\n", + " local_filename (str): The name of the local file to save the downloaded content.\n", + " chunk_size (int): The size in bytes of each chunk. 
Defaults to 1024*1024 (1 MiB).\n", + "\n", + " Returns:\n", + " None\n", + "\n", + " Example usage:\n", + " download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB\n", + " \"\"\"\n", + " # Check if the local file already exists\n", + " if os.path.exists(local_file):\n", + " file_size = format_size(os.path.getsize(local_file))\n", + " print(f\"Local file '{local_file}' ({file_size}) already exists. Skipping download.\")\n", + " return\n", + "\n", + " # Create the directory if it doesn't exist\n", + " os.makedirs(os.path.dirname(local_file), exist_ok=True)\n", + "\n", + " # Stream the file download\n", + " with requests.get(url, stream=True) as r:\n", + " r.raise_for_status()\n", + " with open(local_file, 'wb') as f:\n", + " for chunk in r.iter_content(chunk_size=chunk_size):\n", + " if chunk: # filter out keep-alive new chunks\n", + " f.write(chunk)\n", + " print()\n", + " file_size = format_size(os.path.getsize(local_file))\n", + " print(f\"{local_file} ({file_size}) downloaded successfully.\")\n", + "## --- end: download_file ------\n", + "\n" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-2: Processing input='output/01_text_out' --> output='output/02_docid_out'\n", - "\n" - ] + "cell_type": "markdown", + "id": "dc1972c3", + "metadata": { + "id": "dc1972c3" + }, + "source": [ + "## Step-3: Inspect the Data\n", + "\n", + "We will use simple PDFs. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/pdf-processing-1/input/)\n", + "\n", + "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth.pdf) and its exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth-copy.pdf)\n", + "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth2.pdf) nearly identical to earth.pdf (ONE word difference!)\n", + "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/mars.pdf)\n", + "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/spam.pdf) - contains spammy content\n", + "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder text\n" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "23:44:05 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "23:44:05 INFO - pipeline id pipeline_id\n", - "23:44:05 INFO - code location None\n", - "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/01_text_out output_folder - output/02_docid_out\n", - "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - orchestrator doc_id started at 2025-01-21 23:44:05\n", - "23:44:05 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 
'min_file_size': 
0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", - "23:44:05 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "23:44:05 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "23:44:05 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "23:44:05 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "23:44:05 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "23:44:05 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:44:05 INFO - done flushing in 0.0 sec\n", - "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n" - ] + "cell_type": "code", + "execution_count": 27, + "id": "eG_5od2HjQWG", + "metadata": { + "id": "eG_5od2HjQWG" + }, + "outputs": [], + "source": [] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:2 completed successfully\n", - "CPU times: user 15 ms, sys: 8.25 ms, total: 23.3 ms\n", - "Wall time: 18.9 ms\n" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "from dpk_doc_id.transform_python import DocID\n", - "\n", - "STAGE = 2\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_text_dir}' --> output='{output_docid_dir}'\\n\", flush=True)\n", - "\n", - "result = DocID(input_folder= output_text_dir,\n", - " output_folder= output_docid_dir,\n", - " doc_id_doc_column= \"contents\",\n", - " doc_id_hash_column= \"doc_hash\",\n", - " # doc_id_int_column= \"doc_id\",\n", - " doc_id_int_column= \"int_id_column\",\n", - " #doc_id_start_id= 5\n", - " ).transform()\n", - "\n", - "if result == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (f\"❌ Stage:{STAGE} failed\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "4bd6f382", - "metadata": {}, - "source": [ - "### 5.2 - Inspect Generated output\n", - "\n", - "You would see a new columns **hash** and **docid** " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "f3d4aba9", - "metadata": {}, 
- "outputs": [ + "cell_type": "markdown", + "id": "7113b16c", + "metadata": { + "id": "7113b16c" + }, + "source": [ + "### 3.1 -Download Data" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Displaying contents of : output/02_docid_out\n" - ] + "cell_type": "code", + "execution_count": 6, + "id": "23db1064", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "23db1064", + "outputId": "d871231d-86e2-4db7-a437-1510047bef2a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Local file 'input/earth.pdf' (58.53 KB) already exists. Skipping download.\n", + "Local file 'input/earth-copy.pdf' (58.53 KB) already exists. Skipping download.\n", + "Local file 'input/earth2.pdf' (58.53 KB) already exists. Skipping download.\n", + "Local file 'input/mars.pdf' (57.87 KB) already exists. Skipping download.\n", + "Local file 'input/spam.pdf' (24.87 KB) already exists. Skipping download.\n", + "Local file 'input/lorem-ipsum.pdf' (25.72 KB) already exists. 
Skipping download.\n" + ] + } + ], + "source": [ + "\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + "\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", + "\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + "\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + "\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + "\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))" + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_column
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-21T23:44:04.0670750.636751lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3
1spam.pdfFree xxx102518a2e39-5c85-400f-8864-6bbc3ef20b1e10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-21T23:44:05.3207660.619056spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011c9accf02-d2ed-4307-b0c4-53a3a369917910729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-21T23:44:03.4286400.620741earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101137f9901a-f0b3-49c5-b5cd-dbfeb0126cd47758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-21T23:44:04.7000380.629441mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011b895a249-e72d-4096-85fa-e0606d61aebf14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-21T23:44:01.9171040.993879earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0
\n", - "
" + "cell_type": "markdown", + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + "metadata": { + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" + }, + "source": [ + "## Step-4: Extract Data from PDF (pdf2parquet)\n", + "\n", + "This step we will read PDF files and extract the text data.\n", + "\n", + "[Pdf2Parquet documentation](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/README.md)\n", + "\n", + "We use the [Docling package](https://github.com/DS4SD/docling).\n" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": { + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" + }, + "source": [ + "### 4.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 836, + "referenced_widgets": [ + "df5c199339f5467a91453fa187e201f0", + "257dbf0b62624667b0c82afaf1c8ccf1", + "4e76bef9228546fd97cccfe7bdd856f3", + "c0c37c0262b84e9ebf02c1ce17f263ee", + "ca821137125b45d08e257f95822a6f72", + "fb81f32569c34250b901235698e5ea18", + "1ce164863aa34f64a94aeb5d05103043", + "e2b5f84c30de45d29588a07a3d106eb4", + "cc7d3125eb55461180566d1064eeb2a5", + "68eb811a52804887bc383e89a72a0975", + "55b9873ce1f34c169ecc6087c3cd65a1" + ] + }, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "outputId": "da48c24e-c32c-4fc9-e6aa-37b1921c3d4d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_text_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22:15:10 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "22:15:10 INFO - pipeline id pipeline_id\n", + "22:15:10 INFO - code 
location None\n", + "22:15:10 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_text_out\n", + "22:15:10 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:15:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "22:15:10 INFO - orchestrator pdf2parquet started at 2025-01-22 22:15:10\n", + "22:15:10 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "22:15:10 INFO - Initializing models\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4b24e260fadc41a3a56914a06dd1f568", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 9 files: 0%| | 0/9 [00:00 output='{output_text_dir}'\\n\", flush=True)\n", + "\n", + "result = Pdf2Parquet(input_folder= input_dir,\n", + " output_folder= output_text_dir,\n", + " data_files_to_use=['.pdf'],\n", + " pdf2parquet_contents_type=pdf2parquet_contents_types.MARKDOWN, # markdown\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed\")" ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print (\"Displaying contents of : \", output_docid_dir)\n", - "output_df = read_parquet_files_as_df(output_docid_dir)\n", - "output_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "c55f8d3f", - "metadata": {}, - "source": [ - "## Step-6: Eliminate Duplicate Documents\n", - "\n", - "We have 2 exact duplicates: **earth.pdf** , **earth-copy.pdf**\n", - "\n", - "Note how **doc_hash** for these documents are the same.\n", - "\n", - "[Exact dedupe 
information](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/ededup)" - ] - }, - { - "cell_type": "markdown", - "id": "6f5ef1f7", - "metadata": {}, - "source": [ - "### 6.1 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "90eddb4c", - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-3: Processing input='output/02_docid_out' --> output='output/03_exact_dedupe_out'\n", - "\n" - ] + "cell_type": "markdown", + "id": "5ca790e0", + "metadata": { + "id": "5ca790e0" + }, + "source": [ + "### 4.2 - Inspect Generated output\n", + "\n", + "Here we should see one entry per input file processed." + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "23:44:05 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", - "23:44:05 INFO - pipeline id pipeline_id\n", - "23:44:05 INFO - code location None\n", - "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - orchestrator ededup started at 2025-01-21 23:44:05\n", - "23:44:05 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", - "23:44:05 INFO - Starting from the beginning\n", - "23:44:05 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "23:44:05 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "23:44:05 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "23:44:05 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "23:44:05 INFO - Completed 5 files (83.33%) 
in 0.0 min\n", - "23:44:05 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:44:05 INFO - done flushing in 0.0 sec\n", - "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n" - ] + "cell_type": "code", + "execution_count": 8, + "id": "fe59563d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 557 + }, + "id": "fe59563d", + "outputId": "81b70c9f-cc39-4f78-f29f-f81d4fcf19ae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/01_text_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filename
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102a8502d17-692c-4c88-a2a4-19a19ba078926571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-22T22:16:14.0714530.706817lorem-ipsum.pdf
1spam.pdfFree xxx10208f28dfa-e607-4c47-b9d6-66b7b8c193b710026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-22T22:16:15.3721960.631735spam.pdf
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10111e279ae8-df6a-4b07-8500-6f0a564f352b10729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-22T22:16:13.3630450.796537earth2.pdf
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10113685b6ed-bd33-49f3-95a5-806d28e8311b7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-22T22:16:14.7388180.665504mars.pdf
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10115a070315-684b-481f-9c9f-76903a720d4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-22T22:16:11.7239871.010865earth-copy.pdf
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", + "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", + "2 1e279ae8-df6a-4b07-8500-6f0a564f352b 10729312978404042321 pdf \n", + "3 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", + "4 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2025-01-22T22:16:14.071453 0.706817 lorem-ipsum.pdf \n", + "1 2025-01-22T22:16:15.372196 0.631735 spam.pdf \n", + "2 2025-01-22T22:16:13.363045 0.796537 earth2.pdf \n", + "3 2025-01-22T22:16:14.738818 0.665504 mars.pdf \n", + "4 2025-01-22T22:16:11.723987 1.010865 earth-copy.pdf " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print (\"Displaying contents of : \", output_text_dir)\n", + "output_df = read_parquet_files_as_df(output_text_dir)\n", + "# print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "output_df.head()\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:3 completed successfully\n", - "CPU times: user 20.5 ms, sys: 5.55 ms, total: 26.1 ms\n", - "Wall time: 20.3 ms\n" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "from dpk_ededup.transform_python import Ededup\n", - "\n", - "STAGE = 3\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_docid_dir}' --> output='{output_exact_dedupe_dir}'\\n\", flush=True)\n", - "\n", - "result = Ededup(input_folder=output_docid_dir,\n", - " output_folder=output_exact_dedupe_dir,\n", - " ededup_doc_column=\"contents\",\n", - " ededup_doc_id_column=\"doc_hash\"\n", - " ).transform()\n", - "\n", - "if result == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (f\"❌ Stage:{STAGE} failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "f4aacf09", - "metadata": {}, - "source": [ - "### 6.2 - Inspect Generated output\n", - "\n", - "You can see one of **earth.pdf** or **earth-copy.pdf** will be eliminated." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "1887b26d", - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "id": "e5058a21", + "metadata": { + "id": "e5058a21" + }, + "source": [ + "\n", + "### 4.3 - Understand the output\n", + "\n", + "Here are some interesting attributes to note:\n", + "\n", + "- **filename** : original filename\n", + "- **contents** : text\n", + "- **document_id**: unique id (UUID) assigned to this document\n", + "- **document_hash**: hash of documents\n", + "- **hash** : hash of `contents` column\n", + "- **pdf_convert_time** : time to convert this pdf in seconds\n", + "\n", + "**Note: you should notice the hash values are identical for the duplicate documents**\n", + "\n", + "Let's inspect the **contents** column." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input files before exact dedupe : 6\n", - "Output files after exact dedupe : 5\n", - "Duplicate files removed : 1\n", - "Displaying contents of : output/03_exact_dedupe_out\n" - ] + "cell_type": "code", + "execution_count": 9, + "id": "f870e624", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f870e624", + "outputId": "8064d9df-c226-4795-b9ad-34d50709a8c3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lorem ipsum Lorem ipsum Lorem ipsum\n" + ] + } + ], + "source": [ + "print (output_df.iloc[0, ]['contents'])" + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-21T23:44:04.0670750.636751lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
1spam.pdfFree xxx102518a2e39-5c85-400f-8864-6bbc3ef20b1e10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-21T23:44:05.3207660.619056spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011c9accf02-d2ed-4307-b0c4-53a3a369917910729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-21T23:44:03.4286400.620741earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2[]
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101137f9901a-f0b3-49c5-b5cd-dbfeb0126cd47758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-21T23:44:04.7000380.629441mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011b895a249-e72d-4096-85fa-e0606d61aebf14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-21T23:44:01.9171040.993879earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0[]
\n", - "
" + "cell_type": "code", + "execution_count": 10, + "id": "e1a10c2d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e1a10c2d", + "outputId": "3dbf4e39-1c4c-443e-968c-32aae9010165" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Free xxx\n" + ] + } ], - "text/plain": [ - " filename contents \\\n", - "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", - "1 spam.pdf Free xxx \n", - "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "\n", - " num_pages num_tables num_doc_elements \\\n", - "0 1 0 2 \n", - "1 1 0 2 \n", - "2 1 0 11 \n", - "3 1 0 11 \n", - "4 1 0 11 \n", - "\n", - " document_id document_hash ext \\\n", - "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", - "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", - "2 c9accf02-d2ed-4307-b0c4-53a3a3699179 10729312978404042321 pdf \n", - "3 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", - "4 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", - "\n", - " hash size \\\n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", - "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-21T23:44:04.067075 0.636751 lorem-ipsum.pdf \n", - "1 2025-01-21T23:44:05.320766 0.619056 spam.pdf \n", - "2 2025-01-21T23:44:03.428640 0.620741 earth2.pdf \n", - "3 2025-01-21T23:44:04.700038 0.629441 mars.pdf \n", - "4 2025-01-21T23:44:01.917104 0.993879 earth-copy.pdf \n", - "\n", - " doc_hash int_id_column removed \n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", - "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 [] " + "source": [ + "print (output_df.iloc[1, ]['contents'])\n" ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "input_df = read_parquet_files_as_df(output_docid_dir)\n", - "output_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", - "\n", - "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "print (\"Displaying contents of : \", output_exact_dedupe_dir)\n", - "output_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "76ea34e2", - "metadata": {}, - "source": [ - "## Step-7: Fuzzy Dedupe\n", - "\n", - "In previous step, we removed **exact duplicates (identical documents)**.\n", - "\n", - "Fuzzy de-dupe can further filter out documents that are **not exactly identical, but nearly identical**\n", - "\n", - "For example imagine two documents with one extra blank line. 
For our purposes they are the same.\n", - "\n", - "[Fuzzy dedupe documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/fdedup)\n", - "\n", - "### Tweaking the threshold\n", - "\n", - "**`jaccard_similarity_threshold`** is the parameter used to tweak similarities between documents. It's value is between 0 and 1.0. Values close to 1.0 means more strict checking (fewer documents will qualify). Lower threshold means more leniant matches (more documents will qualify)\n", - "\n", - "Adjust this value to find what works for your documents" - ] - }, - { - "cell_type": "markdown", - "id": "79a37713", - "metadata": {}, - "source": [ - "### 7.1 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "37430b60", - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-4: Processing input='output/03_exact_dedupe_out' --> output='output/04_fuzzy_dedupe_out'\n", - "\n" - ] + "cell_type": "markdown", + "id": "7fc86d5b", + "metadata": { + "id": "7fc86d5b" + }, + "source": [ + "## Step-5: Create DOC ID for Documents\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. 
To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "\n", + "**This step is a pre-requisite for fuzzy dedup** in the pipeline.\n", + "\n", + "[DocID documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/doc_id)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "23:44:05 INFO - Starting SignatureCalculation step\n", - "23:44:05 INFO - Got parameters for SignatureCalculation\n", - "23:44:05 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "23:44:05 INFO - data factory scdata_ is using local configuration without input/output path\n", - "23:44:05 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - pipeline id pipeline_id\n", - "23:44:05 INFO - code location None\n", - "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - orchestrator minhash started at 2025-01-21 23:44:05\n", - "23:44:05 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "23:44:05 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "23:44:05 WARNING - table is empty, skipping 
processing\n", - "23:44:05 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "23:44:05 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "23:44:05 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "23:44:05 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "23:44:05 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "23:44:05 INFO - Done processing 6 files, waiting for flush() completion.\n", - "23:44:05 INFO - Starting flush()\n", - "23:44:05 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", - "23:44:05 INFO - done flushing in 0.021 sec\n", - "23:44:05 INFO - Completed execution in 0.001 min, execution result 0\n", - "23:44:05 INFO - SignatureCalculation completed successfully\n", - "23:44:05 INFO - Starting ClusterAnalysis step\n", - "23:44:05 INFO - Got parameters for ClusterAnalysis\n", - "23:44:05 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "23:44:05 INFO - pipeline id pipeline_id\n", - "23:44:05 INFO - code location None\n", - "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - orchestrator cluster started at 2025-01-21 23:44:05\n", - "23:44:05 INFO - Number of folders is 14\n", - "23:44:05 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "23:44:05 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "23:44:05 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "23:44:05 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "23:44:05 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "23:44:05 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "23:44:05 INFO - Completed 7 files 
(50.0%) in 0.0 min\n", - "23:44:05 INFO - Completed 8 files (57.14%) in 0.0 min\n", - "23:44:05 INFO - Completed 9 files (64.29%) in 0.0 min\n", - "23:44:05 INFO - Completed 10 files (71.43%) in 0.0 min\n", - "23:44:05 INFO - Completed 11 files (78.57%) in 0.0 min\n", - "23:44:05 INFO - Completed 12 files (85.71%) in 0.0 min\n", - "23:44:05 INFO - Completed 13 files (92.86%) in 0.0 min\n", - "23:44:05 INFO - Completed 14 files (100.0%) in 0.0 min\n", - "23:44:05 INFO - Done processing 14 files, waiting for flush() completion.\n", - "23:44:05 INFO - done flushing in 0.0 sec\n", - "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n", - "23:44:05 INFO - ClusterAnalysis completed successfully\n", - "23:44:05 INFO - Starting GetDuplicateList step\n", - "23:44:05 INFO - Got parameters for GetDuplicateList\n", - "23:44:05 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "23:44:05 INFO - pipeline id pipeline_id\n", - "23:44:05 INFO - code location None\n", - "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - orchestrator fdlist started at 2025-01-21 23:44:05\n", - "23:44:05 INFO - Number of folders is 1\n", - "23:44:05 INFO - Get Duplicate List for folder docs_to_remove\n", - "23:44:05 INFO - 1 documents marked as duplicates\n", - "23:44:05 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "23:44:05 INFO - Done processing 1 files, waiting for flush() completion.\n", - "23:44:05 INFO - done flushing in 0.0 sec\n", - "23:44:05 INFO - Completed execution in 0.0 
min, execution result 0\n", - "23:44:05 INFO - GetDuplicateList completed successfully\n", - "23:44:05 INFO - Starting DataCleaning step\n", - "23:44:05 INFO - Got parameters for DataCleaning\n", - "23:44:05 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "23:44:05 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "23:44:05 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - pipeline id pipeline_id\n", - "23:44:05 INFO - code location None\n", - "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - orchestrator fdclean started at 2025-01-21 23:44:05\n", - "23:44:05 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "23:44:05 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "23:44:05 WARNING - table is empty, skipping processing\n", - "23:44:05 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "23:44:05 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "23:44:05 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "23:44:05 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "23:44:05 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "23:44:05 INFO - Done processing 6 files, waiting for 
flush() completion.\n", - "23:44:05 INFO - done flushing in 0.0 sec\n", - "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n", - "23:44:05 INFO - DataCleaning completed successfully\n" - ] + "cell_type": "markdown", + "id": "f516a253", + "metadata": { + "id": "f516a253" + }, + "source": [ + "### 5.1 - Execute" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 311 ms, sys: 72.2 ms, total: 383 ms\n", - "Wall time: 271 ms\n" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "from dpk_fdedup.transform_python import Fdedup\n", - "\n", - "STAGE = 4\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_exact_dedupe_dir}' --> output='{output_fuzzy_dedupe_dir}'\\n\", flush=True)\n", - "\n", - "result = Fdedup(input_folder=output_exact_dedupe_dir,\n", - " output_folder=output_fuzzy_dedupe_dir,\n", - " contents_column= \"contents\",\n", - " # document_id_column= \"doc_id\",\n", - " document_id_column= \"int_id_column\",\n", - " num_permutations= 112,\n", - " num_bands= 14,\n", - " num_minhashes_per_band= 8,\n", - " jaccard_similarity_threshold = 0.8, # between 0 - 1. higher means more strict checking\n", - " operation_mode=\"filter_duplicates\",\n", - " # operation_mode=\"annotate\",\n", - " ).transform()\n", - "# if result == 0:\n", - "# print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "# else:\n", - "# raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" - ] - }, - { - "cell_type": "markdown", - "id": "b2c83592", - "metadata": {}, - "source": [ - "### 7.2 - Inspect Output\n", - "\n", - "FuzzyDedupe will write documents that are filtered in **output/04_fuzzy_dedupe_out/cleaned** folder\n", - "\n", - "You will notice only one **earth.pdf** made it! So fuzzy dedupe did filter out the almost identical doc." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "573faba2", - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 11, + "id": "cee20521", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cee20521", + "outputId": "dd568017-e39c-4524-cdcf-6c97a1341ab9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-2: Processing input='output/01_text_out' --> output='output/02_docid_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22:16:15 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "22:16:15 INFO - pipeline id pipeline_id\n", + "22:16:15 INFO - code location None\n", + "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/01_text_out output_folder - output/02_docid_out\n", + "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - orchestrator doc_id started at 2025-01-22 22:16:15\n", + "22:16:15 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "22:16:15 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "22:16:15 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "22:16:15 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "22:16:15 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "22:16:15 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "22:16:15 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:16:15 INFO - done flushing in 0.0 sec\n", + "22:16:15 INFO - Completed execution in 0.0 min, 
execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:2 completed successfully\n", + "CPU times: user 26.1 ms, sys: 5.22 ms, total: 31.3 ms\n", + "Wall time: 25.3 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_doc_id.transform_python import DocID\n", + "\n", + "STAGE = 2\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_text_dir}' --> output='{output_docid_dir}'\\n\", flush=True)\n", + "\n", + "result = DocID(input_folder= output_text_dir,\n", + " output_folder= output_docid_dir,\n", + " doc_id_doc_column= \"contents\",\n", + " doc_id_hash_column= \"doc_hash\",\n", + " # doc_id_int_column= \"doc_id\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " #doc_id_start_id= 5\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed\")\n" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input files before exact dedupe : 5\n", - "Output files after exact dedupe : 4\n", - "Near duplicate files removed : 1\n", - "Displaying contents of : output/04_fuzzy_dedupe_out\n" - ] + "cell_type": "markdown", + "id": "4bd6f382", + "metadata": { + "id": "4bd6f382" + }, + "source": [ + "### 5.2 - Inspect Generated output\n", + "\n", + "You should see two new columns: **doc_hash** and **int_id_column**" + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-21T23:44:04.0670750.636751lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
1spam.pdfFree xxx102518a2e39-5c85-400f-8864-6bbc3ef20b1e10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-21T23:44:05.3207660.619056spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101137f9901a-f0b3-49c5-b5cd-dbfeb0126cd47758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-21T23:44:04.7000380.629441mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011b895a249-e72d-4096-85fa-e0606d61aebf14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-21T23:44:01.9171040.993879earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0[]
\n", - "
" + "cell_type": "code", + "execution_count": 12, + "id": "f3d4aba9", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 557 + }, + "id": "f3d4aba9", + "outputId": "b4b868b3-ebc7-48a2-f0c5-b0b023a24238" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/02_docid_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_column
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102a8502d17-692c-4c88-a2a4-19a19ba078926571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-22T22:16:14.0714530.706817lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3
1spam.pdfFree xxx10208f28dfa-e607-4c47-b9d6-66b7b8c193b710026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-22T22:16:15.3721960.631735spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10111e279ae8-df6a-4b07-8500-6f0a564f352b10729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-22T22:16:13.3630450.796537earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10113685b6ed-bd33-49f3-95a5-806d28e8311b7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-22T22:16:14.7388180.665504mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10115a070315-684b-481f-9c9f-76903a720d4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-22T22:16:11.7239871.010865earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", + "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", + "2 1e279ae8-df6a-4b07-8500-6f0a564f352b 10729312978404042321 pdf \n", + "3 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", + "4 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-22T22:16:14.071453 0.706817 lorem-ipsum.pdf \n", + "1 2025-01-22T22:16:15.372196 0.631735 spam.pdf \n", + "2 2025-01-22T22:16:13.363045 0.796537 earth2.pdf \n", + "3 2025-01-22T22:16:14.738818 0.665504 mars.pdf \n", + "4 2025-01-22T22:16:11.723987 1.010865 earth-copy.pdf \n", + "\n", + " doc_hash int_id_column \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " filename contents \\\n", - "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", - "1 spam.pdf Free xxx \n", - "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "\n", - " num_pages num_tables num_doc_elements \\\n", - "0 1 0 2 \n", - "1 1 0 2 \n", - "2 1 0 11 \n", - "3 1 0 11 \n", - "\n", - " document_id document_hash ext \\\n", - "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", - "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", - "2 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", - "3 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", - "\n", - " hash size \\\n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-21T23:44:04.067075 0.636751 lorem-ipsum.pdf \n", - "1 2025-01-21T23:44:05.320766 0.619056 spam.pdf \n", - "2 2025-01-21T23:44:04.700038 0.629441 mars.pdf \n", - "3 2025-01-21T23:44:01.917104 0.993879 earth-copy.pdf \n", - "\n", - " doc_hash int_id_column removed \n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 [] " + "source": [ + "print (\"Displaying contents of : \", output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df.head()" ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "input_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", - "output_df = read_parquet_files_as_df(os.path.join(output_fuzzy_dedupe_dir, \"cleaned\"))\n", - "\n", - "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Near duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "print (\"Displaying contents of : \", output_fuzzy_dedupe_dir)\n", - "output_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "3e0598a0", - "metadata": {}, - "source": [ - "## Step-8: Document Quality\n", - "\n", - "This handy plugin will score documents across many metrics.\n", - "\n", - "Here we will look for 'bad words' metric.\n", - "\n", - "[Document quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" - ] - }, - { - "cell_type": "markdown", - "id": "1949c2c4", - "metadata": {}, - "source": [ - "### 8.1 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "b485f598", - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-5: Processing input='output/04_fuzzy_dedupe_out/cleaned' --> output='output/05_doc_quality_out'\n", - "\n" - ] + "cell_type": "markdown", + "id": "c55f8d3f", + "metadata": { + "id": "c55f8d3f" + }, + "source": [ + "## Step-6: Eliminate Duplicate Documents\n", + "\n", + "We have 2 exact duplicates: **earth.pdf** , **earth-copy.pdf**\n", + "\n", + "Note how 
**doc_hash** for these documents are the same.\n", + "\n", + "[Exact dedupe information](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/ededup)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "23:44:05 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-2-pdf-processing/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "23:44:05 INFO - data factory docq_ is using local configuration without input/output path\n", - "23:44:05 INFO - data factory docq_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - pipeline id pipeline_id\n", - "23:44:05 INFO - code location None\n", - "23:44:05 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", - "23:44:05 INFO - data factory data_ max_files -1, n_sample -1\n", - "23:44:05 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "23:44:05 INFO - orchestrator docq started at 2025-01-21 23:44:05\n", - "23:44:05 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", - "23:44:05 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-2-pdf-processing/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "23:44:05 INFO - Completed 1 files (20.0%) in 0.0 min\n", - "23:44:05 WARNING - table is empty, skipping processing\n", - "23:44:05 INFO - Completed 2 files (40.0%) in 0.0 min\n", - "23:44:05 INFO - Completed 3 files (60.0%) in 
0.0 min\n", - "23:44:05 INFO - Completed 4 files (80.0%) in 0.0 min\n", - "23:44:05 INFO - Completed 5 files (100.0%) in 0.0 min\n", - "23:44:05 INFO - Done processing 5 files, waiting for flush() completion.\n", - "23:44:05 INFO - done flushing in 0.0 sec\n", - "23:44:05 INFO - Completed execution in 0.0 min, execution result 0\n" - ] + "cell_type": "markdown", + "id": "6f5ef1f7", + "metadata": { + "id": "6f5ef1f7" + }, + "source": [ + "### 6.1 - Execute" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:5 completed successfully\n", - "CPU times: user 31.5 ms, sys: 2.81 ms, total: 34.3 ms\n", - "Wall time: 28.4 ms\n" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "from dpk_doc_quality.transform_python import DocQuality\n", - "\n", - "STAGE = 5\n", - "output_fuzzy_dedupe_cleaned_dir = os.path.join(output_fuzzy_dedupe_dir, \"cleaned\")\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_fuzzy_dedupe_cleaned_dir}' --> output='{output_doc_quality_dir}'\\n\", flush=True)\n", - "\n", - "result = DocQuality(input_folder=output_fuzzy_dedupe_cleaned_dir,\n", - " output_folder= output_doc_quality_dir,\n", - " docq_text_lang = \"en\",\n", - " docq_doc_content_column =\"contents\",\n", - " ).transform()\n", - "\n", - "if result == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" - ] - }, - { - "cell_type": "markdown", - "id": "eccefd3e", - "metadata": {}, - "source": [ - "### 8.2 - Inspect the Output\n", - "\n", - "We will see several new columns starting with the name **docq_**.\n", - "\n", - "We will look at a metric **docq_contain_bad_word** and filter out any documents that have bad words.\n", - "\n", - "For more information see : [Doc Quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "1f3225f8", - "metadata": 
{}, - "outputs": [ + "cell_type": "code", + "execution_count": 13, + "id": "90eddb4c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "90eddb4c", + "outputId": "61221177-f23e-4daa-8e34-237582fc19b0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-3: Processing input='output/02_docid_out' --> output='output/03_exact_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22:16:15 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "22:16:15 INFO - pipeline id pipeline_id\n", + "22:16:15 INFO - code location None\n", + "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - orchestrator ededup started at 2025-01-22 22:16:15\n", + "22:16:15 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "22:16:15 INFO - Starting from the beginning\n", + "22:16:15 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "22:16:15 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "22:16:15 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "22:16:15 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "22:16:15 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "22:16:15 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:16:15 INFO - done flushing in 0.0 sec\n", + "22:16:15 INFO - Completed execution in 0.0 min, execution result 0\n" 
+ ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:3 completed successfully\n", + "CPU times: user 25.2 ms, sys: 4.28 ms, total: 29.5 ms\n", + "Wall time: 23 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_ededup.transform_python import Ededup\n", + "\n", + "STAGE = 3\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_docid_dir}' --> output='{output_exact_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Ededup(input_folder=output_docid_dir,\n", + " output_folder=output_exact_dedupe_dir,\n", + " ededup_doc_column=\"contents\",\n", + " ededup_doc_id_column=\"doc_hash\"\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Displaying contents of : output/05_doc_quality_out\n" - ] + "cell_type": "markdown", + "id": "f4aacf09", + "metadata": { + "id": "f4aacf09" + }, + "source": [ + "### 6.2 - Inspect Generated output\n", + "\n", + "You can see one of **earth.pdf** or **earth-copy.pdf** will be eliminated." + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b6571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...35...5.0000000.00000010.0857140.0False0.0000000.01.000000False
1spam.pdfFree xxx102518a2e39-5c85-400f-8864-6bbc3ef20b1e10026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...8...3.5000000.00000010.0000000.0True0.0000000.01.000000False
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101137f9901a-f0b3-49c5-b5cd-dbfeb0126cd47758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.0000000.0False0.1764710.00.880000True
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011b895a249-e72d-4096-85fa-e0606d61aebf14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.0000000.0False0.1764710.00.880734True
\n", - "

4 rows × 27 columns

\n", - "
" + "cell_type": "code", + "execution_count": 14, + "id": "1887b26d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 611 + }, + "id": "1887b26d", + "outputId": "31210411-1abd-418a-c1d9-167770788d62" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 6\n", + "Output files after exact dedupe : 5\n", + "Duplicate files removed : 1\n", + "Displaying contents of : output/03_exact_dedupe_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102a8502d17-692c-4c88-a2a4-19a19ba078926571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-22T22:16:14.0714530.706817lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
1spam.pdfFree xxx10208f28dfa-e607-4c47-b9d6-66b7b8c193b710026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-22T22:16:15.3721960.631735spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
2earth2.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10111e279ae8-df6a-4b07-8500-6f0a564f352b10729312978404042321pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...6102025-01-22T22:16:13.3630450.796537earth2.pdff039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...2[]
3mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10113685b6ed-bd33-49f3-95a5-806d28e8311b7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-22T22:16:14.7388180.665504mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
4earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10115a070315-684b-481f-9c9f-76903a720d4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-22T22:16:11.7239871.010865earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0[]
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", + "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", + "2 1e279ae8-df6a-4b07-8500-6f0a564f352b 10729312978404042321 pdf \n", + "3 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", + "4 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-22T22:16:14.071453 0.706817 lorem-ipsum.pdf \n", + "1 2025-01-22T22:16:15.372196 0.631735 spam.pdf \n", + "2 2025-01-22T22:16:13.363045 0.796537 earth2.pdf \n", + "3 2025-01-22T22:16:14.738818 0.665504 mars.pdf \n", + "4 2025-01-22T22:16:11.723987 1.010865 earth-copy.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 [] " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " filename contents \\\n", - "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", - "1 spam.pdf Free xxx \n", - "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "\n", - " num_pages num_tables num_doc_elements \\\n", - "0 1 0 2 \n", - "1 1 0 2 \n", - "2 1 0 11 \n", - "3 1 0 11 \n", - "\n", - " document_id document_hash ext \\\n", - "0 ee1fe28d-fb19-4456-83ac-42a9c7ed2c7b 6571294142213095721 pdf \n", - "1 518a2e39-5c85-400f-8864-6bbc3ef20b1e 10026122586747302274 pdf \n", - "2 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", - "3 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", - "\n", - " hash size ... \\\n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 ... \n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... 
\n", - "\n", - " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", - "0 5.000000 0.000000 1 \n", - "1 3.500000 0.000000 1 \n", - "2 4.688000 0.032000 8 \n", - "3 4.541284 0.027523 9 \n", - "\n", - " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", - "0 0.085714 0.0 False \n", - "1 0.000000 0.0 True \n", - "2 0.000000 0.0 False \n", - "3 0.000000 0.0 False \n", - "\n", - " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", - "0 0.000000 0.0 \n", - "1 0.000000 0.0 \n", - "2 0.176471 0.0 \n", - "3 0.176471 0.0 \n", - "\n", - " docq_alphabet_word_ratio docq_contain_common_en_words \n", - "0 1.000000 False \n", - "1 1.000000 False \n", - "2 0.880000 True \n", - "3 0.880734 True \n", - "\n", - "[4 rows x 27 columns]" + "source": [ + "input_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_exact_dedupe_dir)\n", + "output_df.head()" ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df = read_parquet_files_as_df(output_doc_quality_dir)\n", - "print (\"Displaying contents of : \", output_doc_quality_dir)\n", - "output_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "02fa3bd2", - "metadata": {}, - "source": [ - "### 8.3 - Filtering 'quality' documents\n", - "\n", - "So from the output above we see **spam.pdf** is flagged for containing bad words (see column **docq_contain_bad_word**).\n", - "\n", - "Also **lorem.pdf** is 
flagged for place holder content **lorem ipsum**\n", - "\n", - "We are going to filter them both out" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "5dac1c70", - "metadata": {}, - "outputs": [ + }, + { + "cell_type": "markdown", + "id": "76ea34e2", + "metadata": { + "id": "76ea34e2" + }, + "source": [ + "## Step-7: Fuzzy Dedupe\n", + "\n", + "In the previous step, we removed **exact duplicates (identical documents)**.\n", + "\n", + "Fuzzy de-dupe can further filter out documents that are **not exactly identical, but nearly identical**\n", + "\n", + "Here is a simple example:\n", + "\n", + "`Our solar system is a vast and fascinating expanse`\n", + "\n", + "`The solar system is a vast and fascinating expanse`\n", + "\n", + "Only one word is different `Our` vs `The`.\n", + "\n", + "Imagine two documents with one extra blank line. For our purposes they are the same.\n", + "\n", + "[Fuzzy dedupe documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/fdedup)\n", + "\n", + "### Tweaking fuzzy matches\n", + "\n", + "**`jaccard_similarity_threshold`** is the parameter used to tweak similarities between documents. Its value is between 0 and 1.0. Values close to 1.0 mean stricter checking (fewer documents will qualify). A lower threshold means more lenient matches (more documents will qualify)\n", + "\n", + "Adjust this value to find what works for your documents" + ] + }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...101137f9901a-f0b3-49c5-b5cd-dbfeb0126cd47758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.00.0False0.1764710.00.880000True
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...1011b895a249-e72d-4096-85fa-e0606d61aebf14711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.00.0False0.1764710.00.880734True
\n", - "

2 rows × 27 columns

\n", - "
" + "cell_type": "markdown", + "id": "79a37713", + "metadata": { + "id": "79a37713" + }, + "source": [ + "### 7.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "37430b60", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "37430b60", + "outputId": "48366a20-f5c2-4040-bf56-8b29ce40ed53" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-4: Processing input='output/03_exact_dedupe_out' --> output='output/04_fuzzy_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22:16:15 INFO - Starting SignatureCalculation step\n", + "22:16:15 INFO - Got parameters for SignatureCalculation\n", + "22:16:15 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "22:16:15 INFO - data factory scdata_ is using local configuration without input/output path\n", + "22:16:15 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - pipeline id pipeline_id\n", + "22:16:15 INFO - code location None\n", + "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - orchestrator minhash started at 2025-01-22 22:16:15\n", + "22:16:15 INFO - Number of files is 
6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "22:16:15 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "22:16:15 WARNING - table is empty, skipping processing\n", + "22:16:15 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "22:16:15 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "22:16:15 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "22:16:15 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "22:16:15 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:16:15 INFO - Starting flush()\n", + "22:16:15 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", + "22:16:15 INFO - done flushing in 0.024 sec\n", + "22:16:15 INFO - Completed execution in 0.001 min, execution result 0\n", + "22:16:15 INFO - SignatureCalculation completed successfully\n", + "22:16:15 INFO - Starting ClusterAnalysis step\n", + "22:16:15 INFO - Got parameters for ClusterAnalysis\n", + "22:16:15 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "22:16:15 INFO - pipeline id pipeline_id\n", + "22:16:15 INFO - code location None\n", + "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - orchestrator cluster started at 2025-01-22 22:16:15\n", + "22:16:15 INFO - Number of folders is 14\n", + "22:16:15 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "22:16:15 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "22:16:15 INFO - Completed 3 files 
(21.43%) in 0.0 min\n", + "22:16:15 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "22:16:15 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "22:16:15 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "22:16:15 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "22:16:15 INFO - Completed 8 files (57.14%) in 0.0 min\n", + "22:16:15 INFO - Completed 9 files (64.29%) in 0.0 min\n", + "22:16:15 INFO - Completed 10 files (71.43%) in 0.0 min\n", + "22:16:15 INFO - Completed 11 files (78.57%) in 0.0 min\n", + "22:16:15 INFO - Completed 12 files (85.71%) in 0.0 min\n", + "22:16:15 INFO - Completed 13 files (92.86%) in 0.0 min\n", + "22:16:15 INFO - Completed 14 files (100.0%) in 0.001 min\n", + "22:16:15 INFO - Done processing 14 files, waiting for flush() completion.\n", + "22:16:15 INFO - done flushing in 0.0 sec\n", + "22:16:15 INFO - Completed execution in 0.001 min, execution result 0\n", + "22:16:15 INFO - ClusterAnalysis completed successfully\n", + "22:16:15 INFO - Starting GetDuplicateList step\n", + "22:16:15 INFO - Got parameters for GetDuplicateList\n", + "22:16:15 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "22:16:15 INFO - pipeline id pipeline_id\n", + "22:16:15 INFO - code location None\n", + "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - orchestrator fdlist started at 2025-01-22 22:16:15\n", + "22:16:15 INFO - Number of folders is 1\n", + "22:16:15 INFO - Get Duplicate List for folder docs_to_remove\n", + "22:16:15 INFO - 1 documents marked as 
duplicates\n", + "22:16:15 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "22:16:15 INFO - Done processing 1 files, waiting for flush() completion.\n", + "22:16:15 INFO - done flushing in 0.0 sec\n", + "22:16:15 INFO - Completed execution in 0.0 min, execution result 0\n", + "22:16:15 INFO - GetDuplicateList completed successfully\n", + "22:16:15 INFO - Starting DataCleaning step\n", + "22:16:15 INFO - Got parameters for DataCleaning\n", + "22:16:15 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "22:16:15 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "22:16:15 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - pipeline id pipeline_id\n", + "22:16:15 INFO - code location None\n", + "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - orchestrator fdclean started at 2025-01-22 22:16:15\n", + "22:16:15 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "22:16:15 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "22:16:15 WARNING - table is empty, skipping processing\n", + "22:16:15 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "22:16:15 INFO - Completed 3 files (50.0%) in 
0.0 min\n", + "22:16:15 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "22:16:15 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "22:16:15 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:16:15 INFO - done flushing in 0.0 sec\n", + "22:16:15 INFO - Completed execution in 0.0 min, execution result 0\n", + "22:16:15 INFO - DataCleaning completed successfully\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 237 ms, sys: 82.4 ms, total: 320 ms\n", + "Wall time: 209 ms\n" + ] + } ], - "text/plain": [ - " filename contents \\\n", - "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", - "\n", - " num_pages num_tables num_doc_elements \\\n", - "2 1 0 11 \n", - "3 1 0 11 \n", - "\n", - " document_id document_hash ext \\\n", - "2 37f9901a-f0b3-49c5-b5cd-dbfeb0126cd4 7758129997476962679 pdf \n", - "3 b895a249-e72d-4096-85fa-e0606d61aebf 14711865278795535908 pdf \n", - "\n", - " hash size ... \\\n", - "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", - "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... 
\n", - "\n", - " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", - "2 4.688000 0.032000 8 \n", - "3 4.541284 0.027523 9 \n", - "\n", - " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", - "2 0.0 0.0 False \n", - "3 0.0 0.0 False \n", - "\n", - " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", - "2 0.176471 0.0 \n", - "3 0.176471 0.0 \n", - "\n", - " docq_alphabet_word_ratio docq_contain_common_en_words \n", - "2 0.880000 True \n", - "3 0.880734 True \n", - "\n", - "[2 rows x 27 columns]" + "source": [ + "%%time\n", + "\n", + "from dpk_fdedup.transform_python import Fdedup\n", + "\n", + "STAGE = 4\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_exact_dedupe_dir}' --> output='{output_fuzzy_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Fdedup(input_folder=output_exact_dedupe_dir,\n", + " output_folder=output_fuzzy_dedupe_dir,\n", + " contents_column= \"contents\",\n", + " # document_id_column= \"doc_id\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " jaccard_similarity_threshold = 0.8, # between 0 - 1. 
higher means more strict checking\n", + " operation_mode=\"filter_duplicates\",\n", + " # operation_mode=\"annotate\",\n", + " ).transform()\n", + "# if result == 0:\n", + "# print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "# else:\n", + "# raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "all_docs_df = read_parquet_files_as_df(output_doc_quality_dir)\n", - "\n", - "# remove documents with badwords\n", - "clean_docs_df = all_docs_df[all_docs_df['docq_contain_bad_word'] == False]\n", - "\n", - "# also filter out 'lorem ipsum' text\n", - "clean_docs_df = clean_docs_df[clean_docs_df['docq_lorem_ipsum_ratio'] == 0]\n", - "\n", - "clean_docs_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f5e12630-be6b-4188-a925-77117155617b", - "metadata": { - "id": "f5e12630-be6b-4188-a925-77117155617b" - }, - "source": [ - "## Step-9: Copy output to final output dir" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "outputId": "31f09b58-7b2d-48bb-9dac-bc0ba9625c01" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Saved parquet output to 'output/output_final/pq'\n" - ] - } - ], - "source": [ - "import shutil\n", - "\n", - "shutil.rmtree(output_final_dir, ignore_errors=True)\n", - "shutil.os.makedirs(output_final_dir, exist_ok=True)\n", - "\n", - "output_final_dir_parquet = os.path.join (output_final_dir, 'pq')\n", - "shutil.os.makedirs(output_final_dir_parquet, exist_ok=True)\n", - "\n", - "output_final_dir_markdown = os.path.join (output_final_dir, 'markdown')\n", - "shutil.os.makedirs(output_final_dir_markdown, exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "e06ce4f2", - "metadata": {}, 
- "outputs": [ + "cell_type": "markdown", + "id": "b2c83592", + "metadata": { + "id": "b2c83592" + }, + "source": [ + "### 7.2 - Inspect Output\n", + "\n", + "Fuzzy dedupe writes the documents that survive filtering to the **output/04_fuzzy_dedupe_out/cleaned** folder.\n", + "\n", + "You will notice that only one of the Earth documents made it! So fuzzy dedupe did filter out the almost identical doc." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Saved CLEAN parquet output to 'output/output_final/pq'\n" - ] - } - ], - "source": [ - "## save parquet\n", - "\n", - "clean_docs_df.to_parquet(os.path.join(output_final_dir_parquet, \"clean_docs.parquet\"))\n", - "print (f\"✅ Saved CLEAN parquet output to '{output_final_dir_parquet}'\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "1e175302", - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 16, + "id": "573faba2", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 511 + }, + "id": "573faba2", + "outputId": "49408c6e-a22b-404f-ccc5-c00edb7ce85a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 5\n", + "Output files after exact dedupe : 4\n", + "Near duplicate files removed : 1\n", + "Displaying contents of : output/04_fuzzy_dedupe_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsizedate_acquiredpdf_convert_timesource_filenamedoc_hashint_id_columnremoved
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102a8502d17-692c-4c88-a2a4-19a19ba078926571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...352025-01-22T22:16:14.0714530.706817lorem-ipsum.pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...3[]
1spam.pdfFree xxx10208f28dfa-e607-4c47-b9d6-66b7b8c193b710026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...82025-01-22T22:16:15.3721960.631735spam.pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...5[]
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10113685b6ed-bd33-49f3-95a5-806d28e8311b7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...7172025-01-22T22:16:14.7388180.665504mars.pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...4[]
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10115a070315-684b-481f-9c9f-76903a720d4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...6102025-01-22T22:16:11.7239871.010865earth-copy.pdf6140cf695f269a3ddca6568536076756105ad3186086b2...0[]
\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", + "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", + "2 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", + "3 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-22T22:16:14.071453 0.706817 lorem-ipsum.pdf \n", + "1 2025-01-22T22:16:15.372196 0.631735 spam.pdf \n", + "2 2025-01-22T22:16:14.738818 0.665504 mars.pdf \n", + "3 2025-01-22T22:16:11.723987 1.010865 earth-copy.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 [] " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "output_df = read_parquet_files_as_df(os.path.join(output_fuzzy_dedupe_dir, \"cleaned\"))\n", + "\n", + "# input_df : what fuzzy dedupe received, i.e. the output folder of the exact dedupe step\n", + "# output_df : what fuzzy dedupe kept, read from its 'cleaned' sub-folder\n", + "print (f\"Input files before fuzzy dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after fuzzy dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Near duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_fuzzy_dedupe_dir)\n", + "output_df.head()" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Saved CLEAN markdown output to 'output/output_final/markdown'\n" - ] - } - ], - "source": [ - "## save markdown text\n", - "\n", - "for index, row in clean_docs_df.iterrows():\n", - " output_file_name = os.path.join (output_final_dir_markdown, row['filename'] + '.md')\n", - " with open(output_file_name, 'w') as output_file:\n", - " output_file.write(row['contents'])\n", - " \n", - "print (f\"✅ Saved CLEAN markdown output to '{output_final_dir_markdown}'\")\n" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "dpk-2-pdf-processing", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "06f9b33494984e4885d5aad813d1d2bc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state":
{ - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } + "cell_type": "markdown", + "id": "3e0598a0", + "metadata": { + "id": "3e0598a0" + }, + "source": [ + "## Step-8: Document Quality\n", + "\n", + "This handy plugin will score documents across many metrics.\n", + "\n", + "Here we will look for 'bad words' metric.\n", + "\n", + "[Document quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)\n", + "\n", + "By default it uses [bad words collection](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality/dpk_doc_quality/ldnoobw). You can supply a custom file by passing an argument `bad_word_filepath=/path/to/badwords_file`" + ] }, - "1cb3bbf7d724411cbe9831543a4aecc0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - 
"object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } + { + "cell_type": "markdown", + "id": "1949c2c4", + "metadata": { + "id": "1949c2c4" + }, + "source": [ + "### 8.1 - Execute" + ] }, - "553f3c16839a49d79591d0fc4862bed6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } + { + "cell_type": "code", + "execution_count": 17, + "id": "b485f598", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b485f598", + "outputId": "448a8ee1-9371-4bd4-f5ad-a596893fe65f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-5: Processing input='output/04_fuzzy_dedupe_out/cleaned' --> output='output/05_doc_quality_out'\n", + "\n" + ] + 
}, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22:16:15 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-3-pdf-processing-r1.0.0-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "22:16:15 INFO - data factory docq_ is using local configuration without input/output path\n", + "22:16:15 INFO - data factory docq_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - pipeline id pipeline_id\n", + "22:16:15 INFO - code location None\n", + "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:16:15 INFO - orchestrator docq started at 2025-01-22 22:16:15\n", + "22:16:15 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", + "22:16:15 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-3-pdf-processing-r1.0.0-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "22:16:15 INFO - Completed 1 files (20.0%) in 0.0 min\n", + "22:16:15 WARNING - table is empty, skipping processing\n", + "22:16:15 INFO - Completed 2 files (40.0%) in 0.0 min\n", + "22:16:15 INFO - Completed 3 files (60.0%) in 0.0 min\n", + "22:16:15 INFO - Completed 4 files (80.0%) in 0.0 min\n", + "22:16:15 INFO - Completed 5 files (100.0%) in 0.0 min\n", + 
"22:16:15 INFO - Done processing 5 files, waiting for flush() completion.\n", + "22:16:15 INFO - done flushing in 0.0 sec\n", + "22:16:15 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:5 completed successfully\n", + "CPU times: user 36.1 ms, sys: 2.93 ms, total: 39 ms\n", + "Wall time: 33.4 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from dpk_doc_quality.transform_python import DocQuality\n", + "\n", + "STAGE = 5\n", + "output_fuzzy_dedupe_cleaned_dir = os.path.join(output_fuzzy_dedupe_dir, \"cleaned\")\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_fuzzy_dedupe_cleaned_dir}' --> output='{output_doc_quality_dir}'\\n\", flush=True)\n", + "\n", + "result = DocQuality(input_folder=output_fuzzy_dedupe_cleaned_dir,\n", + " output_folder= output_doc_quality_dir,\n", + " docq_text_lang = \"en\",\n", + " docq_doc_content_column =\"contents\",\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" + ] + }, + { + "cell_type": "markdown", + "id": "eccefd3e", + "metadata": { + "id": "eccefd3e" + }, + "source": [ + "### 8.2 - Inspect the Output\n", + "\n", + "We will see several new columns starting with the name **docq_**.\n", + "\n", + "We will look at a metric **docq_contain_bad_word** and filter out any documents that have bad words.\n", + "\n", + "For more information see : [Doc Quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" + ] }, - "7053c9606a414e978636a7e241909504": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - 
"_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1cb3bbf7d724411cbe9831543a4aecc0", - "placeholder": "​", - "style": "IPY_MODEL_06f9b33494984e4885d5aad813d1d2bc", - "value": " 10/10 [00:00<00:00, 349.38it/s]" - } + { + "cell_type": "code", + "execution_count": 18, + "id": "1f3225f8", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 485 + }, + "id": "1f3225f8", + "outputId": "a6009dc0-6ca6-411a-8066-090c610860e0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Displaying contents of : output/05_doc_quality_out\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
0lorem-ipsum.pdfLorem ipsum Lorem ipsum Lorem ipsum102a8502d17-692c-4c88-a2a4-19a19ba078926571294142213095721pdfbc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...35...5.0000000.00000010.0857140.0False0.0000000.01.000000False
1spam.pdfFree xxx10208f28dfa-e607-4c47-b9d6-66b7b8c193b710026122586747302274pdf543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...8...3.5000000.00000010.0000000.0True0.0000000.01.000000False
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10113685b6ed-bd33-49f3-95a5-806d28e8311b7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.0000000.0False0.1764710.00.880000True
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10115a070315-684b-481f-9c9f-76903a720d4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.0000000.0False0.1764710.00.880734True
\n", + "

4 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", + "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", + "2 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", + "3 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "\n", + " hash size ... \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 ... \n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... 
\n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "0 5.000000 0.000000 1 \n", + "1 3.500000 0.000000 1 \n", + "2 4.688000 0.032000 8 \n", + "3 4.541284 0.027523 9 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "0 0.085714 0.0 False \n", + "1 0.000000 0.0 True \n", + "2 0.000000 0.0 False \n", + "3 0.000000 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "0 0.000000 0.0 \n", + "1 0.000000 0.0 \n", + "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "0 1.000000 False \n", + "1 1.000000 False \n", + "2 0.880000 True \n", + "3 0.880734 True \n", + "\n", + "[4 rows x 27 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "print (\"Displaying contents of : \", output_doc_quality_dir)\n", + "output_df.head()" + ] }, - "724778729161445c98b187031ae4f67c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } + { + "cell_type": "markdown", + "id": "02fa3bd2", + "metadata": { + "id": "02fa3bd2" + }, + "source": [ + "### 8.3 - Filtering 'quality' documents\n", + "\n", + "So from the output above we see **spam.pdf** is flagged for containing bad words (**docq_contain_bad_word=True**).\n", + "\n", + "Also **lorem-ipsum.pdf** is flagged for placeholder content **lorem ipsum** (**docq_lorem_ipsum_ratio > 0**).\n", + "\n", + "We are going to filter them both out." + ] }, -
"97b603697cfa4b4ea4e6735b6768ca35": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e87e8d3262c54cfaaa8768505edacda3", - "IPY_MODEL_b78aa40816e44f7fbebcb24ca68818b3", - "IPY_MODEL_7053c9606a414e978636a7e241909504" + { + "cell_type": "code", + "execution_count": 19, + "id": "5dac1c70", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "5dac1c70", + "outputId": "463e897f-1099-410a-f753-34c4846228c3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecontentsnum_pagesnum_tablesnum_doc_elementsdocument_iddocument_hashexthashsize...docq_mean_word_lendocq_symbol_to_word_ratiodocq_sentence_countdocq_lorem_ipsum_ratiodocq_curly_bracket_ratiodocq_contain_bad_worddocq_bullet_point_ratiodocq_ellipsis_line_ratiodocq_alphabet_word_ratiodocq_contain_common_en_words
2mars.pdf## Mars\\n\\n## Solar System\\n\\nOur solar system...10113685b6ed-bd33-49f3-95a5-806d28e8311b7758129997476962679pdfa3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...717...4.6880000.03200080.00.0False0.1764710.00.880000True
3earth-copy.pdf## Earth\\n\\n## Solar System\\n\\nOur solar syste...10115a070315-684b-481f-9c9f-76903a720d4414711865278795535908pdf6140cf695f269a3ddca6568536076756105ad3186086b2...610...4.5412840.02752390.00.0False0.1764710.00.880734True
\n", + "

2 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " filename contents \\\n", + "2 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "3 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "2 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", + "3 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "\n", + " hash size ... \\\n", + "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "2 4.688000 0.032000 8 \n", + "3 4.541284 0.027523 9 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "2 0.0 0.0 False \n", + "3 0.0 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "2 0.880000 True \n", + "3 0.880734 True \n", + "\n", + "[2 rows x 27 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } ], - "layout": "IPY_MODEL_da0787b239764847a731083997780a85" - } + "source": [ + "all_docs_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "\n", + "# remove documents with badwords\n", + "clean_docs_df = all_docs_df[all_docs_df['docq_contain_bad_word'] == False]\n", + "\n", + "# also filter out 'lorem ipsum' text\n", + "clean_docs_df = clean_docs_df[clean_docs_df['docq_lorem_ipsum_ratio'] == 0]\n", + "\n", + "clean_docs_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": { + "id": "f5e12630-be6b-4188-a925-77117155617b" + }, + "source": [ + "## Step-9: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + 
"execution_count": 20, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "metadata": { + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207" + }, + "outputs": [], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(output_final_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_final_dir, exist_ok=True)\n", + "\n", + "output_final_dir_parquet = os.path.join (output_final_dir, 'pq')\n", + "shutil.os.makedirs(output_final_dir_parquet, exist_ok=True)\n", + "\n", + "output_final_dir_markdown = os.path.join (output_final_dir, 'markdown')\n", + "shutil.os.makedirs(output_final_dir_markdown, exist_ok=True)" + ] }, - "9d184ed175f0403fb03c2e13dfd04e0a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } + { + "cell_type": "code", + "execution_count": 21, + "id": "e06ce4f2", + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "e06ce4f2", + "outputId": "8a26e407-2cc8-44ee-ba6b-ca6485a92926" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Saved CLEAN parquet output to 'output/output_final/pq'\n" + ] + } + ], + "source": [ + "## save parquet\n", + "\n", + "clean_docs_df.to_parquet(os.path.join(output_final_dir_parquet, \"clean_docs.parquet\"))\n", + "print (f\"✅ Saved CLEAN parquet output to '{output_final_dir_parquet}'\")" + ] }, - "b78aa40816e44f7fbebcb24ca68818b3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9d184ed175f0403fb03c2e13dfd04e0a", - "max": 10, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_724778729161445c98b187031ae4f67c", - "value": 10 - } + { + "cell_type": "code", + "execution_count": 22, + "id": "1e175302", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1e175302", + "outputId": "d54c5d80-23ce-49a6-e098-8e712d048975" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Saved CLEAN markdown output to 'output/output_final/markdown'\n" + ] + } + ], + "source": [ + "## save markdown text\n", + "\n", + "for index, row in clean_docs_df.iterrows():\n", + " output_file_name = os.path.join (output_final_dir_markdown, row['filename'] + '.md')\n", + " with open(output_file_name, 'w') as output_file:\n", + " output_file.write(row['contents'])\n", + "\n", + "print (f\"✅ Saved CLEAN markdown output to '{output_final_dir_markdown}'\")\n" + ] + } + ], + 
"metadata": { + "colab": { + "provenance": [] }, - "c0eb5bc8f6ee427ca42204b3c56f9a4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } + "kernelspec": { + "display_name": "dpk-3-pdf-processing-r1.0.0-py3.11", + "language": "python", + "name": "python3" }, - "da0787b239764847a731083997780a85": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" }, - "e87e8d3262c54cfaaa8768505edacda3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_553f3c16839a49d79591d0fc4862bed6", - "placeholder": "​", - "style": "IPY_MODEL_c0eb5bc8f6ee427ca42204b3c56f9a4e", - "value": "Fetching 10 files: 100%" - } + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1ce164863aa34f64a94aeb5d05103043": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "257dbf0b62624667b0c82afaf1c8ccf1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fb81f32569c34250b901235698e5ea18", + "placeholder": "​", + "style": "IPY_MODEL_1ce164863aa34f64a94aeb5d05103043", + "value": "Fetching 9 files: 100%" + } + }, + 
"4e76bef9228546fd97cccfe7bdd856f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e2b5f84c30de45d29588a07a3d106eb4", + "max": 9, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cc7d3125eb55461180566d1064eeb2a5", + "value": 9 + } + }, + "55b9873ce1f34c169ecc6087c3cd65a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "68eb811a52804887bc383e89a72a0975": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c0c37c0262b84e9ebf02c1ce17f263ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_68eb811a52804887bc383e89a72a0975", + "placeholder": "​", + "style": "IPY_MODEL_55b9873ce1f34c169ecc6087c3cd65a1", + "value": " 9/9 [00:00<00:00, 220.49it/s]" + } + }, + "ca821137125b45d08e257f95822a6f72": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cc7d3125eb55461180566d1064eeb2a5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "df5c199339f5467a91453fa187e201f0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_257dbf0b62624667b0c82afaf1c8ccf1", + "IPY_MODEL_4e76bef9228546fd97cccfe7bdd856f3", + "IPY_MODEL_c0c37c0262b84e9ebf02c1ce17f263ee" + ], + "layout": "IPY_MODEL_ca821137125b45d08e257f95822a6f72" + } + }, + "e2b5f84c30de45d29588a07a3d106eb4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fb81f32569c34250b901235698e5ea18": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + } + } } - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 + }, + "nbformat": 4, + "nbformat_minor": 5 } From 9a7c8305f1cff57ebd221646483236bfdac8c629 Mon Sep 17 00:00:00 2001 From: Sujee Maniyam Date: Wed, 29 Jan 2025 11:34:00 -0800 Subject: [PATCH 4/6] Updated pdf-processing-example - Updated RAY version to newer/simpler API - Added a troubleshooting section to README - Misc cleanups Signed-off-by: Sujee Maniyam --- examples/notebooks/pdf-processing-1/README.md | 20 +- .../pdf_processing_1_python.ipynb | 765 +-- .../pdf_processing_1_ray.ipynb | 4623 ++++++----------- .../pdf-processing-1/requirements.txt | 6 + 4 files changed, 2020 insertions(+), 3394 deletions(-) create mode 100644 examples/notebooks/pdf-processing-1/requirements.txt diff --git a/examples/notebooks/pdf-processing-1/README.md b/examples/notebooks/pdf-processing-1/README.md index dc63ecd34..70337476f 100644 --- a/examples/notebooks/pdf-processing-1/README.md +++ b/examples/notebooks/pdf-processing-1/README.md @@ -24,12 +24,8 @@ conda create -n data-prep-kit -y python=3.11 conda activate data-prep-kit # install the following in 'data-prep-kit' environment -pip3 install 'data-prep-toolkit-transforms[ray,all]==1.0.0' -pip3 install jupyterlab ipykernel ipywidgets - -## install custom kernel -## Important: Use this kernel when running example notebooks! 
-python -m ipykernel install --user --name=data-prep-kit --display-name "dataprepkit" +cd examples/notebooks/pdf-processing-1 +pip3 install -r requirements.txt # start jupyter and run the notebooks with this jupyter jupyter lab @@ -42,6 +38,15 @@ jupyter lab [ray version](pdf_processing_1_ray.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb) +## Troubleshooting + +If you encounter any errors loading libraries, try creating a custom kernel and using it to run the notebooks. + +```bash +python -m ipykernel install --user --name=data-prep-kit --display-name "dataprepkit" +# and select this kernel within jupyter notebook +``` + ## Creating Input PDFs (Optional) @@ -55,4 +60,5 @@ pandoc earth2.md -o earth2.pdf pandoc mars.md -o mars.pdf pandoc spam.md -o spam.pdf pandoc lorem-ipsum.md -o lorem-ipsum.pdf -``` \ No newline at end of file +``` + diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb index 30ba3316c..90a09cfe6 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb @@ -216,7 +216,7 @@ "shutil.os.makedirs(input_dir, exist_ok=True)\n", "output_dir = \"output\"\n", "\n", - "output_text_dir = os.path.join (output_dir, '01_text_out')\n", + "output_pdf2pq_dir = os.path.join (output_dir, '01_pdf2pq_out')\n", "output_docid_dir = os.path.join (output_dir, '02_docid_out')\n", "output_exact_dedupe_dir = os.path.join (output_dir, '03_exact_dedupe_out')\n", "output_fuzzy_dedupe_dir = os.path.join (output_dir, '04_fuzzy_dedupe_out')\n", @@ -325,16 +325,6 @@ "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf) - 
contains 'lorem ipsum' placeholder\n" ] }, - { - "cell_type": "code", - "execution_count": 27, - "id": "eG_5od2HjQWG", - "metadata": { - "id": "eG_5od2HjQWG" - }, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "id": "7113b16c", @@ -441,7 +431,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_text_out'\n", + "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_pdf2pq_out'\n", "\n" ] }, @@ -449,21 +439,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:15:10 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "22:15:10 INFO - pipeline id pipeline_id\n", - "22:15:10 INFO - code location None\n", - "22:15:10 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_text_out\n", - "22:15:10 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:15:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "22:15:10 INFO - orchestrator pdf2parquet started at 2025-01-22 22:15:10\n", - "22:15:10 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "22:15:10 INFO - Initializing models\n" + "11:27:11 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "11:27:11 INFO - pipeline id pipeline_id\n", + "11:27:11 INFO - code location None\n", + "11:27:11 INFO - data factory data_ is using local data access: input_folder - input output_folder - 
output/01_pdf2pq_out\n", + "11:27:11 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:27:11 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "11:27:11 INFO - orchestrator pdf2parquet started at 2025-01-29 11:27:11\n", + "11:27:11 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "11:27:11 INFO - Initializing models\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4b24e260fadc41a3a56914a06dd1f568", + "model_id": "1e7c9a2ba2a841a0b84db84b3d60974e", "version_major": 2, "version_minor": 0 }, @@ -478,15 +468,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:16:11 INFO - Completed 1 files (16.67%) in 0.017 min\n", - "22:16:12 INFO - Completed 2 files (33.33%) in 0.031 min\n", - "22:16:13 INFO - Completed 3 files (50.0%) in 0.044 min\n", - "22:16:14 INFO - Completed 4 files (66.67%) in 0.056 min\n", - "22:16:14 INFO - Completed 5 files (83.33%) in 0.067 min\n", - "22:16:15 INFO - Completed 6 files (100.0%) in 0.078 min\n", - "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:16:15 INFO - done flushing in 0.0 sec\n", - "22:16:15 INFO - Completed execution in 1.08 min, execution result 0\n" + "11:27:18 INFO - Completed 1 files (16.67%) in 0.019 min\n", + "11:27:19 INFO - Completed 2 files (33.33%) in 0.034 min\n", + "11:27:19 INFO - Completed 3 files (50.0%) in 0.045 min\n", + "11:27:20 INFO - Completed 4 files (66.67%) in 0.055 min\n", + "11:27:21 INFO - Completed 5 files (83.33%) in 0.066 min\n", + "11:27:21 INFO - Completed 6 files (100.0%) in 0.078 min\n", + "11:27:21 INFO - Done processing 6 files, waiting for flush() completion.\n", + "11:27:21 INFO - done flushing in 0.0 sec\n", + "11:27:21 INFO - Completed execution in 0.168 min, execution result 
0\n" ] }, { @@ -494,8 +484,8 @@ "output_type": "stream", "text": [ "✅ Stage:1 completed successfully\n", - "CPU times: user 26 s, sys: 4.56 s, total: 30.6 s\n", - "Wall time: 1min 8s\n" + "CPU times: user 21.5 s, sys: 2.22 s, total: 23.7 s\n", + "Wall time: 14.1 s\n" ] } ], @@ -506,10 +496,10 @@ "from dpk_pdf2parquet.transform import pdf2parquet_contents_types\n", "\n", "STAGE = 1\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_dir}' --> output='{output_text_dir}'\\n\", flush=True)\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_dir}' --> output='{output_pdf2pq_dir}'\\n\", flush=True)\n", "\n", "result = Pdf2Parquet(input_folder= input_dir,\n", - " output_folder= output_text_dir,\n", + " output_folder= output_pdf2pq_dir,\n", " data_files_to_use=['.pdf'],\n", " pdf2parquet_contents_type=pdf2parquet_contents_types.MARKDOWN, # markdown\n", " ).transform()\n", @@ -549,7 +539,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Displaying contents of : output/01_text_out\n" + "Displaying contents of : output/01_pdf2pq_out\n" ] }, { @@ -596,13 +586,13 @@ " 1\n", " 0\n", " 2\n", - " a8502d17-692c-4c88-a2a4-19a19ba07892\n", + " 5f1e1408-481a-4463-b7d1-7048da581607\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-22T22:16:14.071453\n", - " 0.706817\n", + " 2025-01-29T11:27:20.388498\n", + " 0.628371\n", " lorem-ipsum.pdf\n", " \n", " \n", @@ -612,13 +602,13 @@ " 1\n", " 0\n", " 2\n", - " 08f28dfa-e607-4c47-b9d6-66b7b8c193b7\n", + " dc0531e1-bc12-4919-8e27-13763592280e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-22T22:16:15.372196\n", - " 0.631735\n", + " 2025-01-29T11:27:21.754298\n", + " 0.730394\n", " spam.pdf\n", " \n", " \n", @@ -628,13 +618,13 @@ " 1\n", " 0\n", " 11\n", - " 1e279ae8-df6a-4b07-8500-6f0a564f352b\n", + " 74eaf9f3-716d-43c6-9cbc-b1497454d33b\n", " 10729312978404042321\n", " pdf\n", " 
f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-22T22:16:13.363045\n", - " 0.796537\n", + " 2025-01-29T11:27:19.749728\n", + " 0.643720\n", " earth2.pdf\n", " \n", " \n", @@ -644,13 +634,13 @@ " 1\n", " 0\n", " 11\n", - " 3685b6ed-bd33-49f3-95a5-806d28e8311b\n", + " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-22T22:16:14.738818\n", - " 0.665504\n", + " 2025-01-29T11:27:21.013529\n", + " 0.614578\n", " mars.pdf\n", " \n", " \n", @@ -660,15 +650,31 @@ " 1\n", " 0\n", " 11\n", - " 5a070315-684b-481f-9c9f-76903a720d44\n", + " 523f8ace-a61e-4f27-9970-84581ba6626a\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-22T22:16:11.723987\n", - " 1.010865\n", + " 2025-01-29T11:27:18.200289\n", + " 1.119966\n", " earth-copy.pdf\n", " \n", + " \n", + " 5\n", + " earth.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", + " 1\n", + " 0\n", + " 11\n", + " a41f26ae-fec2-43e6-8225-36bd944b4684\n", + " 14711865278795535908\n", + " pdf\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " 2025-01-29T11:27:19.098169\n", + " 0.881966\n", + " earth.pdf\n", + " \n", " \n", "\n", "" @@ -680,6 +686,7 @@ "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "5 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", @@ -687,13 +694,15 @@ "2 1 0 11 \n", "3 1 0 11 \n", "4 1 0 11 \n", + "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", - "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", - "2 1e279ae8-df6a-4b07-8500-6f0a564f352b 10729312978404042321 pdf \n", - "3 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", - "4 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", + "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", + "2 74eaf9f3-716d-43c6-9cbc-b1497454d33b 10729312978404042321 pdf \n", + "3 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", + "4 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", + "5 a41f26ae-fec2-43e6-8225-36bd944b4684 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -701,13 +710,15 @@ "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \n", - "0 2025-01-22T22:16:14.071453 0.706817 lorem-ipsum.pdf \n", - "1 2025-01-22T22:16:15.372196 0.631735 spam.pdf \n", - "2 2025-01-22T22:16:13.363045 0.796537 earth2.pdf \n", - "3 2025-01-22T22:16:14.738818 0.665504 mars.pdf \n", - "4 2025-01-22T22:16:11.723987 1.010865 earth-copy.pdf " + "0 2025-01-29T11:27:20.388498 0.628371 lorem-ipsum.pdf \n", + "1 2025-01-29T11:27:21.754298 0.730394 spam.pdf \n", + "2 2025-01-29T11:27:19.749728 0.643720 earth2.pdf \n", + "3 2025-01-29T11:27:21.013529 0.614578 mars.pdf \n", + "4 2025-01-29T11:27:18.200289 1.119966 earth-copy.pdf \n", + "5 2025-01-29T11:27:19.098169 0.881966 earth.pdf " ] }, "execution_count": 8, @@ -716,10 +727,10 @@ } ], "source": [ - "print (\"Displaying contents of : \", output_text_dir)\n", - "output_df = read_parquet_files_as_df(output_text_dir)\n", + "print (\"Displaying contents of : \", output_pdf2pq_dir)\n", + "output_df = read_parquet_files_as_df(output_pdf2pq_dir)\n", "# print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", - "output_df.head()\n", + "output_df.head(10)\n", "\n", "## To display certain columns\n", "#parquet_df[['column1', 'column2', 'column3']].head(5)" @@ -765,12 +776,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "Lorem ipsum Lorem ipsum Lorem ipsum\n" + "## Earth\n", + "\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "\n", + "For more details about our Solar system see Chapter 1.\n", + "\n", + "## Earth\n", + "\n", + "Earth is the third planet from the Sun. It's our home planet. 
Earth is the only place we know of with life.\n", + "\n", + "Basic facts about Earth:\n", + "\n", + "- · Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "- · Moons: One moon, called Luna or simply \"the Moon\".\n", + "- · Rotation Period: 24 hours (one day)\n" ] } ], "source": [ - "print (output_df.iloc[0, ]['contents'])" + "print (output_df[output_df['filename'] == 'earth.pdf'].iloc[0,]['contents'])" ] }, { @@ -794,7 +821,25 @@ } ], "source": [ - "print (output_df.iloc[1, ]['contents'])\n" + "print (output_df[output_df['filename'] == 'spam.pdf'].iloc[0,]['contents'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b37dd994", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lorem ipsum Lorem ipsum Lorem ipsum\n" + ] + } + ], + "source": [ + "print (output_df[output_df['filename'] == 'lorem-ipsum.pdf'].iloc[0,]['contents'])" ] }, { @@ -828,7 +873,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "cee20521", "metadata": { "colab": { @@ -842,7 +887,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-2: Processing input='output/01_text_out' --> output='output/02_docid_out'\n", + "🏃🏼 STAGE-2: Processing input='output/01_pdf2pq_out' --> output='output/02_docid_out'\n", "\n" ] }, @@ -850,23 +895,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:16:15 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "22:16:15 INFO - pipeline id pipeline_id\n", - "22:16:15 INFO - code location None\n", - "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/01_text_out output_folder - output/02_docid_out\n", - "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to 
use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - orchestrator doc_id started at 2025-01-22 22:16:15\n", - "22:16:15 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", - "22:16:15 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "22:16:15 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "22:16:15 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "22:16:15 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "22:16:15 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "22:16:15 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:16:15 INFO - done flushing in 0.0 sec\n", - "22:16:15 INFO - Completed execution in 0.0 min, execution result 0\n" + "11:27:21 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "11:27:21 INFO - pipeline id pipeline_id\n", + "11:27:21 INFO - code location None\n", + "11:27:21 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "11:27:21 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:27:21 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:21 INFO - orchestrator doc_id started at 2025-01-29 11:27:21\n", + "11:27:21 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "11:27:21 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "11:27:21 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "11:27:21 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "11:27:21 INFO - Completed 4 files (66.67%) in 0.0 min\n", + 
"11:27:21 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "11:27:21 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "11:27:21 INFO - Done processing 6 files, waiting for flush() completion.\n", + "11:27:21 INFO - done flushing in 0.0 sec\n", + "11:27:21 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -874,8 +919,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 26.1 ms, sys: 5.22 ms, total: 31.3 ms\n", - "Wall time: 25.3 ms\n" + "CPU times: user 28 ms, sys: 2.28 ms, total: 30.3 ms\n", + "Wall time: 25.8 ms\n" ] } ], @@ -885,16 +930,16 @@ "from dpk_doc_id.transform_python import DocID\n", "\n", "STAGE = 2\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_text_dir}' --> output='{output_docid_dir}'\\n\", flush=True)\n", - "\n", - "result = DocID(input_folder= output_text_dir,\n", - " output_folder= output_docid_dir,\n", - " doc_id_doc_column= \"contents\",\n", - " doc_id_hash_column= \"doc_hash\",\n", - " # doc_id_int_column= \"doc_id\",\n", - " doc_id_int_column= \"int_id_column\",\n", - " #doc_id_start_id= 5\n", - " ).transform()\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_pdf2pq_dir}' --> output='{output_docid_dir}'\\n\", flush=True)\n", + "\n", + "result = DocID(input_folder= output_pdf2pq_dir,\n", + " output_folder= output_docid_dir,\n", + " doc_id_doc_column= \"contents\",\n", + " doc_id_hash_column= \"doc_hash\",\n", + " # doc_id_int_column= \"doc_id_int\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " #doc_id_start_id= 5\n", + " ).transform()\n", "\n", "if result == 0:\n", " print (f\"✅ Stage:{STAGE} completed successfully\")\n", @@ -916,7 +961,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "f3d4aba9", "metadata": { "colab": { @@ -980,13 +1025,13 @@ " 1\n", " 0\n", " 2\n", - " a8502d17-692c-4c88-a2a4-19a19ba07892\n", + " 5f1e1408-481a-4463-b7d1-7048da581607\n", " 6571294142213095721\n", " pdf\n", " 
bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-22T22:16:14.071453\n", - " 0.706817\n", + " 2025-01-29T11:27:20.388498\n", + " 0.628371\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -998,13 +1043,13 @@ " 1\n", " 0\n", " 2\n", - " 08f28dfa-e607-4c47-b9d6-66b7b8c193b7\n", + " dc0531e1-bc12-4919-8e27-13763592280e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-22T22:16:15.372196\n", - " 0.631735\n", + " 2025-01-29T11:27:21.754298\n", + " 0.730394\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1016,13 +1061,13 @@ " 1\n", " 0\n", " 11\n", - " 1e279ae8-df6a-4b07-8500-6f0a564f352b\n", + " 74eaf9f3-716d-43c6-9cbc-b1497454d33b\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-22T22:16:13.363045\n", - " 0.796537\n", + " 2025-01-29T11:27:19.749728\n", + " 0.643720\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1034,13 +1079,13 @@ " 1\n", " 0\n", " 11\n", - " 3685b6ed-bd33-49f3-95a5-806d28e8311b\n", + " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-22T22:16:14.738818\n", - " 0.665504\n", + " 2025-01-29T11:27:21.013529\n", + " 0.614578\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1052,17 +1097,35 @@ " 1\n", " 0\n", " 11\n", - " 5a070315-684b-481f-9c9f-76903a720d44\n", + " 523f8ace-a61e-4f27-9970-84581ba6626a\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-22T22:16:11.723987\n", - " 1.010865\n", + " 2025-01-29T11:27:18.200289\n", + " 1.119966\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", " \n", + " \n", + " 5\n", + " earth.pdf\n", 
+ " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", + " 1\n", + " 0\n", + " 11\n", + " a41f26ae-fec2-43e6-8225-36bd944b4684\n", + " 14711865278795535908\n", + " pdf\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " 2025-01-29T11:27:19.098169\n", + " 0.881966\n", + " earth.pdf\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 1\n", + " \n", " \n", "\n", "" @@ -1074,6 +1137,7 @@ "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "5 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", @@ -1081,13 +1145,15 @@ "2 1 0 11 \n", "3 1 0 11 \n", "4 1 0 11 \n", + "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", - "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", - "2 1e279ae8-df6a-4b07-8500-6f0a564f352b 10729312978404042321 pdf \n", - "3 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", - "4 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", + "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", + "2 74eaf9f3-716d-43c6-9cbc-b1497454d33b 10729312978404042321 pdf \n", + "3 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", + "4 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", + "5 a41f26ae-fec2-43e6-8225-36bd944b4684 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1095,23 +1161,26 @@ "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 
717 \n", "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-22T22:16:14.071453 0.706817 lorem-ipsum.pdf \n", - "1 2025-01-22T22:16:15.372196 0.631735 spam.pdf \n", - "2 2025-01-22T22:16:13.363045 0.796537 earth2.pdf \n", - "3 2025-01-22T22:16:14.738818 0.665504 mars.pdf \n", - "4 2025-01-22T22:16:11.723987 1.010865 earth-copy.pdf \n", + "0 2025-01-29T11:27:20.388498 0.628371 lorem-ipsum.pdf \n", + "1 2025-01-29T11:27:21.754298 0.730394 spam.pdf \n", + "2 2025-01-29T11:27:19.749728 0.643720 earth2.pdf \n", + "3 2025-01-29T11:27:21.013529 0.614578 mars.pdf \n", + "4 2025-01-29T11:27:18.200289 1.119966 earth-copy.pdf \n", + "5 2025-01-29T11:27:19.098169 0.881966 earth.pdf \n", "\n", " doc_hash int_id_column \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 \n", "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 " + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
1 " ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1119,7 +1188,7 @@ "source": [ "print (\"Displaying contents of : \", output_docid_dir)\n", "output_df = read_parquet_files_as_df(output_docid_dir)\n", - "output_df.head()" + "output_df.head(10)" ] }, { @@ -1150,7 +1219,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "90eddb4c", "metadata": { "colab": { @@ -1172,24 +1241,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:16:15 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", - "22:16:15 INFO - pipeline id pipeline_id\n", - "22:16:15 INFO - code location None\n", - "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - orchestrator ededup started at 2025-01-22 22:16:15\n", - "22:16:15 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", - "22:16:15 INFO - Starting from the beginning\n", - "22:16:15 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "22:16:15 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "22:16:15 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "22:16:15 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "22:16:15 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "22:16:15 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:16:15 INFO - done flushing in 0.0 sec\n", - "22:16:15 INFO - Completed 
execution in 0.0 min, execution result 0\n" + "11:27:21 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "11:27:21 INFO - pipeline id pipeline_id\n", + "11:27:21 INFO - code location None\n", + "11:27:21 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "11:27:21 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:27:21 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:21 INFO - orchestrator ededup started at 2025-01-29 11:27:21\n", + "11:27:21 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "11:27:21 INFO - Starting from the beginning\n", + "11:27:21 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "11:27:21 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "11:27:21 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "11:27:21 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "11:27:21 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "11:27:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "11:27:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "11:27:22 INFO - done flushing in 0.0 sec\n", + "11:27:22 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -1197,8 +1266,8 @@ "output_type": "stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 25.2 ms, sys: 4.28 ms, total: 29.5 ms\n", - "Wall time: 23 ms\n" + "CPU times: user 34.4 ms, sys: 2.97 ms, total: 37.3 ms\n", + "Wall time: 31.2 ms\n" ] } ], @@ -1211,10 +1280,10 @@ "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_docid_dir}' --> output='{output_exact_dedupe_dir}'\\n\", flush=True)\n", "\n", "result = 
Ededup(input_folder=output_docid_dir,\n", - " output_folder=output_exact_dedupe_dir,\n", - " ededup_doc_column=\"contents\",\n", - " ededup_doc_id_column=\"doc_hash\"\n", - " ).transform()\n", + " output_folder=output_exact_dedupe_dir,\n", + " ededup_doc_column=\"contents\",\n", + " ededup_doc_id_column=\"doc_hash\"\n", + " ).transform()\n", "\n", "if result == 0:\n", " print (f\"✅ Stage:{STAGE} completed successfully\")\n", @@ -1236,7 +1305,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "1887b26d", "metadata": { "colab": { @@ -1304,13 +1373,13 @@ " 1\n", " 0\n", " 2\n", - " a8502d17-692c-4c88-a2a4-19a19ba07892\n", + " 5f1e1408-481a-4463-b7d1-7048da581607\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-22T22:16:14.071453\n", - " 0.706817\n", + " 2025-01-29T11:27:20.388498\n", + " 0.628371\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1323,13 +1392,13 @@ " 1\n", " 0\n", " 2\n", - " 08f28dfa-e607-4c47-b9d6-66b7b8c193b7\n", + " dc0531e1-bc12-4919-8e27-13763592280e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-22T22:16:15.372196\n", - " 0.631735\n", + " 2025-01-29T11:27:21.754298\n", + " 0.730394\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1342,13 +1411,13 @@ " 1\n", " 0\n", " 11\n", - " 1e279ae8-df6a-4b07-8500-6f0a564f352b\n", + " 74eaf9f3-716d-43c6-9cbc-b1497454d33b\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-22T22:16:13.363045\n", - " 0.796537\n", + " 2025-01-29T11:27:19.749728\n", + " 0.643720\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1361,13 +1430,13 @@ " 1\n", " 0\n", " 11\n", - " 3685b6ed-bd33-49f3-95a5-806d28e8311b\n", + " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", " 
7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-22T22:16:14.738818\n", - " 0.665504\n", + " 2025-01-29T11:27:21.013529\n", + " 0.614578\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1380,13 +1449,13 @@ " 1\n", " 0\n", " 11\n", - " 5a070315-684b-481f-9c9f-76903a720d44\n", + " 523f8ace-a61e-4f27-9970-84581ba6626a\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-22T22:16:11.723987\n", - " 1.010865\n", + " 2025-01-29T11:27:18.200289\n", + " 1.119966\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1412,11 +1481,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", - "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", - "2 1e279ae8-df6a-4b07-8500-6f0a564f352b 10729312978404042321 pdf \n", - "3 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", - "4 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", + "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", + "2 74eaf9f3-716d-43c6-9cbc-b1497454d33b 10729312978404042321 pdf \n", + "3 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", + "4 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1426,11 +1495,11 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-22T22:16:14.071453 0.706817 lorem-ipsum.pdf \n", - "1 2025-01-22T22:16:15.372196 0.631735 spam.pdf \n", - "2 2025-01-22T22:16:13.363045 0.796537 earth2.pdf \n", - "3 2025-01-22T22:16:14.738818 0.665504 mars.pdf \n", - "4 2025-01-22T22:16:11.723987 1.010865 earth-copy.pdf \n", + "0 2025-01-29T11:27:20.388498 0.628371 lorem-ipsum.pdf \n", + "1 2025-01-29T11:27:21.754298 0.730394 spam.pdf \n", + "2 2025-01-29T11:27:19.749728 0.643720 earth2.pdf \n", + "3 2025-01-29T11:27:21.013529 0.614578 mars.pdf \n", + "4 2025-01-29T11:27:18.200289 1.119966 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", @@ -1440,7 +1509,7 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 [] " ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1456,7 +1525,7 @@ "print (\"Duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", "\n", "print (\"Displaying contents of : \", output_exact_dedupe_dir)\n", - "output_df.head()" + "output_df.head(10)" ] }, { @@ -1503,7 +1572,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "37430b60", "metadata": { "colab": { @@ -1525,109 +1594,109 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:16:15 INFO - Starting SignatureCalculation step\n", - "22:16:15 INFO - Got parameters for SignatureCalculation\n", - "22:16:15 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "22:16:15 INFO - data factory scdata_ is using local configuration without input/output path\n", - "22:16:15 INFO - data factory scdata_ max_files -1, n_sample -1\n", - 
"22:16:15 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - pipeline id pipeline_id\n", - "22:16:15 INFO - code location None\n", - "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - orchestrator minhash started at 2025-01-22 22:16:15\n", - "22:16:15 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "22:16:15 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "22:16:15 WARNING - table is empty, skipping processing\n", - "22:16:15 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "22:16:15 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "22:16:15 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "22:16:15 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "22:16:15 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:16:15 INFO - Starting flush()\n", - "22:16:15 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", - "22:16:15 INFO - done flushing in 0.024 sec\n", - "22:16:15 INFO - Completed execution in 0.001 min, execution result 0\n", - "22:16:15 INFO - SignatureCalculation completed successfully\n", - "22:16:15 INFO - Starting ClusterAnalysis step\n", - "22:16:15 INFO - Got parameters for ClusterAnalysis\n", - "22:16:15 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - 
"22:16:15 INFO - pipeline id pipeline_id\n", - "22:16:15 INFO - code location None\n", - "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - orchestrator cluster started at 2025-01-22 22:16:15\n", - "22:16:15 INFO - Number of folders is 14\n", - "22:16:15 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "22:16:15 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "22:16:15 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "22:16:15 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "22:16:15 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "22:16:15 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "22:16:15 INFO - Completed 7 files (50.0%) in 0.0 min\n", - "22:16:15 INFO - Completed 8 files (57.14%) in 0.0 min\n", - "22:16:15 INFO - Completed 9 files (64.29%) in 0.0 min\n", - "22:16:15 INFO - Completed 10 files (71.43%) in 0.0 min\n", - "22:16:15 INFO - Completed 11 files (78.57%) in 0.0 min\n", - "22:16:15 INFO - Completed 12 files (85.71%) in 0.0 min\n", - "22:16:15 INFO - Completed 13 files (92.86%) in 0.0 min\n", - "22:16:15 INFO - Completed 14 files (100.0%) in 0.001 min\n", - "22:16:15 INFO - Done processing 14 files, waiting for flush() completion.\n", - "22:16:15 INFO - done flushing in 0.0 sec\n", - "22:16:15 INFO - Completed execution in 0.001 min, execution result 0\n", - "22:16:15 INFO - ClusterAnalysis completed successfully\n", - "22:16:15 INFO - Starting GetDuplicateList step\n", - "22:16:15 INFO - Got parameters for GetDuplicateList\n", - "22:16:15 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 
'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "22:16:15 INFO - pipeline id pipeline_id\n", - "22:16:15 INFO - code location None\n", - "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - orchestrator fdlist started at 2025-01-22 22:16:15\n", - "22:16:15 INFO - Number of folders is 1\n", - "22:16:15 INFO - Get Duplicate List for folder docs_to_remove\n", - "22:16:15 INFO - 1 documents marked as duplicates\n", - "22:16:15 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "22:16:15 INFO - Done processing 1 files, waiting for flush() completion.\n", - "22:16:15 INFO - done flushing in 0.0 sec\n", - "22:16:15 INFO - Completed execution in 0.0 min, execution result 0\n", - "22:16:15 INFO - GetDuplicateList completed successfully\n", - "22:16:15 INFO - Starting DataCleaning step\n", - "22:16:15 INFO - Got parameters for DataCleaning\n", - "22:16:15 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "22:16:15 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "22:16:15 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - pipeline id pipeline_id\n", - "22:16:15 INFO - code location None\n", - "22:16:15 INFO - data factory data_ is using local data access: input_folder - 
output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - orchestrator fdclean started at 2025-01-22 22:16:15\n", - "22:16:15 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "22:16:15 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "22:16:15 WARNING - table is empty, skipping processing\n", - "22:16:15 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "22:16:15 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "22:16:15 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "22:16:15 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "22:16:15 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "22:16:15 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:16:15 INFO - done flushing in 0.0 sec\n", - "22:16:15 INFO - Completed execution in 0.0 min, execution result 0\n", - "22:16:15 INFO - DataCleaning completed successfully\n" + "11:27:22 INFO - Starting SignatureCalculation step\n", + "11:27:22 INFO - Got parameters for SignatureCalculation\n", + "11:27:22 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "11:27:22 INFO - data factory scdata_ is using local configuration without input/output path\n", + "11:27:22 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "11:27:22 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use 
['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:22 INFO - pipeline id pipeline_id\n", + "11:27:22 INFO - code location None\n", + "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:22 INFO - orchestrator minhash started at 2025-01-29 11:27:22\n", + "11:27:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "11:27:22 INFO - Completed 1 files (16.67%) in 0.001 min\n", + "11:27:22 WARNING - table is empty, skipping processing\n", + "11:27:22 INFO - Completed 2 files (33.33%) in 0.001 min\n", + "11:27:22 INFO - Completed 3 files (50.0%) in 0.001 min\n", + "11:27:22 INFO - Completed 4 files (66.67%) in 0.001 min\n", + "11:27:22 INFO - Completed 5 files (83.33%) in 0.001 min\n", + "11:27:22 INFO - Completed 6 files (100.0%) in 0.001 min\n", + "11:27:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "11:27:22 INFO - Starting flush()\n", + "11:27:22 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", + "11:27:22 INFO - done flushing in 0.028 sec\n", + "11:27:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "11:27:22 INFO - SignatureCalculation completed successfully\n", + "11:27:22 INFO - Starting ClusterAnalysis step\n", + "11:27:22 INFO - Got parameters for ClusterAnalysis\n", + "11:27:22 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "11:27:22 INFO - pipeline id pipeline_id\n", + "11:27:22 INFO - code location None\n", + "11:27:22 INFO - data factory 
data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:22 INFO - orchestrator cluster started at 2025-01-29 11:27:22\n", + "11:27:22 INFO - Number of folders is 14\n", + "11:27:22 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "11:27:22 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "11:27:22 INFO - Completed 3 files (21.43%) in 0.0 min\n", + "11:27:22 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "11:27:22 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "11:27:22 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "11:27:22 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "11:27:22 INFO - Completed 8 files (57.14%) in 0.001 min\n", + "11:27:22 INFO - Completed 9 files (64.29%) in 0.001 min\n", + "11:27:22 INFO - Completed 10 files (71.43%) in 0.001 min\n", + "11:27:22 INFO - Completed 11 files (78.57%) in 0.001 min\n", + "11:27:22 INFO - Completed 12 files (85.71%) in 0.001 min\n", + "11:27:22 INFO - Completed 13 files (92.86%) in 0.001 min\n", + "11:27:22 INFO - Completed 14 files (100.0%) in 0.001 min\n", + "11:27:22 INFO - Done processing 14 files, waiting for flush() completion.\n", + "11:27:22 INFO - done flushing in 0.0 sec\n", + "11:27:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "11:27:22 INFO - ClusterAnalysis completed successfully\n", + "11:27:22 INFO - Starting GetDuplicateList step\n", + "11:27:22 INFO - Got parameters for GetDuplicateList\n", + "11:27:22 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "11:27:22 INFO - pipeline id 
pipeline_id\n", + "11:27:22 INFO - code location None\n", + "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:22 INFO - orchestrator fdlist started at 2025-01-29 11:27:22\n", + "11:27:22 INFO - Number of folders is 1\n", + "11:27:22 INFO - Get Duplicate List for folder docs_to_remove\n", + "11:27:22 INFO - 1 documents marked as duplicates\n", + "11:27:22 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "11:27:22 INFO - Done processing 1 files, waiting for flush() completion.\n", + "11:27:22 INFO - done flushing in 0.0 sec\n", + "11:27:22 INFO - Completed execution in 0.0 min, execution result 0\n", + "11:27:22 INFO - GetDuplicateList completed successfully\n", + "11:27:22 INFO - Starting DataCleaning step\n", + "11:27:22 INFO - Got parameters for DataCleaning\n", + "11:27:22 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "11:27:22 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "11:27:22 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "11:27:22 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:22 INFO - pipeline id pipeline_id\n", + "11:27:22 INFO - code location None\n", + "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "11:27:22 INFO - data factory data_ 
max_files -1, n_sample -1\n", + "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:22 INFO - orchestrator fdclean started at 2025-01-29 11:27:22\n", + "11:27:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "11:27:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "11:27:22 WARNING - table is empty, skipping processing\n", + "11:27:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "11:27:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "11:27:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "11:27:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "11:27:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "11:27:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "11:27:22 INFO - done flushing in 0.0 sec\n", + "11:27:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "11:27:22 INFO - DataCleaning completed successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 237 ms, sys: 82.4 ms, total: 320 ms\n", - "Wall time: 209 ms\n" + "CPU times: user 223 ms, sys: 99.6 ms, total: 322 ms\n", + "Wall time: 282 ms\n" ] } ], @@ -1673,7 +1742,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "573faba2", "metadata": { "colab": { @@ -1741,13 +1810,13 @@ " 1\n", " 0\n", " 2\n", - " a8502d17-692c-4c88-a2a4-19a19ba07892\n", + " 5f1e1408-481a-4463-b7d1-7048da581607\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-22T22:16:14.071453\n", - " 0.706817\n", + " 2025-01-29T11:27:20.388498\n", + " 0.628371\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1760,13 +1829,13 @@ " 1\n", " 0\n", " 
2\n", - " 08f28dfa-e607-4c47-b9d6-66b7b8c193b7\n", + " dc0531e1-bc12-4919-8e27-13763592280e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-22T22:16:15.372196\n", - " 0.631735\n", + " 2025-01-29T11:27:21.754298\n", + " 0.730394\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1779,13 +1848,13 @@ " 1\n", " 0\n", " 11\n", - " 3685b6ed-bd33-49f3-95a5-806d28e8311b\n", + " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-22T22:16:14.738818\n", - " 0.665504\n", + " 2025-01-29T11:27:21.013529\n", + " 0.614578\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1798,13 +1867,13 @@ " 1\n", " 0\n", " 11\n", - " 5a070315-684b-481f-9c9f-76903a720d44\n", + " 523f8ace-a61e-4f27-9970-84581ba6626a\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-22T22:16:11.723987\n", - " 1.010865\n", + " 2025-01-29T11:27:18.200289\n", + " 1.119966\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1828,10 +1897,10 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", - "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", - "2 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", - "3 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", + "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", + "2 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", + "3 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
35 \n", @@ -1840,10 +1909,10 @@ "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-22T22:16:14.071453 0.706817 lorem-ipsum.pdf \n", - "1 2025-01-22T22:16:15.372196 0.631735 spam.pdf \n", - "2 2025-01-22T22:16:14.738818 0.665504 mars.pdf \n", - "3 2025-01-22T22:16:11.723987 1.010865 earth-copy.pdf \n", + "0 2025-01-29T11:27:20.388498 0.628371 lorem-ipsum.pdf \n", + "1 2025-01-29T11:27:21.754298 0.730394 spam.pdf \n", + "2 2025-01-29T11:27:21.013529 0.614578 mars.pdf \n", + "3 2025-01-29T11:27:18.200289 1.119966 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", @@ -1852,7 +1921,7 @@ "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 [] " ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1868,7 +1937,7 @@ "print (\"Near duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", "\n", "print (\"Displaying contents of : \", output_fuzzy_dedupe_dir)\n", - "output_df.head()" + "output_df.head(10)" ] }, { @@ -1901,7 +1970,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "b485f598", "metadata": { "colab": { @@ -1923,27 +1992,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:16:15 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-3-pdf-processing-r1.0.0-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "22:16:15 INFO - data factory docq_ is using local configuration without input/output path\n", - "22:16:15 INFO - data factory docq_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint 
['.parquet']\n", - "22:16:15 INFO - pipeline id pipeline_id\n", - "22:16:15 INFO - code location None\n", - "22:16:15 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", - "22:16:15 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:16:15 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:16:15 INFO - orchestrator docq started at 2025-01-22 22:16:15\n", - "22:16:15 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", - "22:16:15 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-3-pdf-processing-r1.0.0-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "22:16:15 INFO - Completed 1 files (20.0%) in 0.0 min\n", - "22:16:15 WARNING - table is empty, skipping processing\n", - "22:16:15 INFO - Completed 2 files (40.0%) in 0.0 min\n", - "22:16:15 INFO - Completed 3 files (60.0%) in 0.0 min\n", - "22:16:15 INFO - Completed 4 files (80.0%) in 0.0 min\n", - "22:16:15 INFO - Completed 5 files (100.0%) in 0.0 min\n", - "22:16:15 INFO - Done processing 5 files, waiting for flush() completion.\n", - "22:16:15 INFO - done flushing in 0.0 sec\n", - "22:16:15 INFO - Completed execution in 0.0 min, execution result 0\n" + "11:27:22 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "11:27:22 INFO - data factory docq_ is using local configuration without input/output path\n", + "11:27:22 INFO - data factory docq_ max_files -1, n_sample -1\n", + "11:27:22 INFO - data 
factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:22 INFO - pipeline id pipeline_id\n", + "11:27:22 INFO - code location None\n", + "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:27:22 INFO - orchestrator docq started at 2025-01-29 11:27:22\n", + "11:27:22 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", + "11:27:22 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "11:27:22 INFO - Completed 1 files (20.0%) in 0.0 min\n", + "11:27:22 WARNING - table is empty, skipping processing\n", + "11:27:22 INFO - Completed 2 files (40.0%) in 0.0 min\n", + "11:27:22 INFO - Completed 3 files (60.0%) in 0.0 min\n", + "11:27:22 INFO - Completed 4 files (80.0%) in 0.0 min\n", + "11:27:22 INFO - Completed 5 files (100.0%) in 0.0 min\n", + "11:27:22 INFO - Done processing 5 files, waiting for flush() completion.\n", + "11:27:22 INFO - done flushing in 0.0 sec\n", + "11:27:22 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -1951,8 +2020,8 @@ "output_type": "stream", "text": [ "✅ Stage:5 completed successfully\n", - "CPU times: user 36.1 ms, sys: 2.93 ms, total: 39 ms\n", - "Wall time: 33.4 ms\n" + "CPU times: user 34.5 ms, sys: 5.21 ms, total: 39.7 ms\n", + "Wall time: 35.4 ms\n" ] } ], @@ -1988,14 +2057,16 @@ "\n", "We will see several new columns starting 
with the name **docq_**.\n", "\n", - "We will look at a metric **docq_contain_bad_word** and filter out any documents that have bad words.\n", + "Look at the column **docq_contain_bad_word**; this will flag documents with 'bad words'.\n", + "\n", + "Also inspect the column **docq_lorem_ipsum_ratio**; this will flag documents with 'lorem ipsum' text\n", "\n", "For more information see : [Doc Quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "1f3225f8", "metadata": { "colab": { @@ -2065,7 +2136,7 @@ " 1\n", " 0\n", " 2\n", - " a8502d17-692c-4c88-a2a4-19a19ba07892\n", + " 5f1e1408-481a-4463-b7d1-7048da581607\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", @@ -2089,7 +2160,7 @@ " 1\n", " 0\n", " 2\n", - " 08f28dfa-e607-4c47-b9d6-66b7b8c193b7\n", + " dc0531e1-bc12-4919-8e27-13763592280e\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", @@ -2113,7 +2184,7 @@ " 1\n", " 0\n", " 11\n", - " 3685b6ed-bd33-49f3-95a5-806d28e8311b\n", + " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2137,7 +2208,7 @@ " 1\n", " 0\n", " 11\n", - " 5a070315-684b-481f-9c9f-76903a720d44\n", + " 523f8ace-a61e-4f27-9970-84581ba6626a\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2173,10 +2244,10 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 a8502d17-692c-4c88-a2a4-19a19ba07892 6571294142213095721 pdf \n", - "1 08f28dfa-e607-4c47-b9d6-66b7b8c193b7 10026122586747302274 pdf \n", - "2 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", - "3 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", + "1 
dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", + "2 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", + "3 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", @@ -2211,7 +2282,7 @@ "[4 rows x 27 columns]" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2240,7 +2311,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "5dac1c70", "metadata": { "colab": { @@ -2303,7 +2374,7 @@ " 1\n", " 0\n", " 11\n", - " 3685b6ed-bd33-49f3-95a5-806d28e8311b\n", + " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2327,7 +2398,7 @@ " 1\n", " 0\n", " 11\n", - " 5a070315-684b-481f-9c9f-76903a720d44\n", + " 523f8ace-a61e-4f27-9970-84581ba6626a\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2359,8 +2430,8 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "2 3685b6ed-bd33-49f3-95a5-806d28e8311b 7758129997476962679 pdf \n", - "3 5a070315-684b-481f-9c9f-76903a720d44 14711865278795535908 pdf \n", + "2 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", + "3 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... 
\n", @@ -2385,7 +2456,7 @@ "[2 rows x 27 columns]" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2399,7 +2470,7 @@ "# also filter out 'lorem ipsum' text\n", "clean_docs_df = clean_docs_df[clean_docs_df['docq_lorem_ipsum_ratio'] == 0]\n", "\n", - "clean_docs_df.head()" + "clean_docs_df.head(10)" ] }, { @@ -2414,7 +2485,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", "metadata": { "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207" @@ -2435,7 +2506,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "e06ce4f2", "metadata": { "colab": { @@ -2462,7 +2533,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "1e175302", "metadata": { "colab": { @@ -2497,7 +2568,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "dpk-3-pdf-processing-r1.0.0-py3.11", + "display_name": "dpk-6-pdf-processing-r1.0.0-all-py3.11", "language": "python", "name": "python3" }, diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb index b2feb9135..69e705ae6 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb @@ -7,14 +7,19 @@ "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866" }, "source": [ - "# Data Prep Kit Demo 1 - Ray Version\n", + "# Processing PDFs using Data Prep Kit (Ray version)\n", "\n", "This notebook will introduce DPK and showcase some of it's capabilities.\n", "\n", - "Here is the workflow\n", + "Here is the workflow:\n", "\n", - "![](https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/images/data-prep-kit-3-workflow.png)\n", - "\n" + "- pdf2parquet: Extract text from PDF documents\n", + "- docid: compute hashes\n", + "- exact dedupe : filter out identical documents\n", + "- 
fuzzy dedupe : filter out 'near duplicates'\n", + "- document quality: scoring documents for quality\n", + "\n", + "![](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/images/data-prep-kit-3-workflow.png)\n" ] }, { @@ -28,7 +33,7 @@ "\n", "Two options:\n", "\n", - "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/dpk_intro_1_ray.ipynb)\n", + "- **Option 1 - Google Colab:** easiest option. no setup required. Click this link to open this on google colab. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb)\n", "- **Option 2 - Local python dev environment:** Setup using this [guide](../../../README.md#-getting-started)\n", "\n", "The notebook will work as in both environments" @@ -36,29 +41,12 @@ }, { "cell_type": "markdown", - "id": "eb8b0d5c", - "metadata": { - "id": "eb8b0d5c" - }, - "source": [ - "## Step-1: Inspect the Data\n", - "\n", - "We will use simple PDFs about Solar system. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/intro/input/solar-system)\n", - "\n", - "- [earth.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/earth.pdf)\n", - "- [mars.pdf](https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/intro/input/solar-system/mars.pdf)\n" - ] - }, - { - "cell_type": "markdown", - "id": "39a0ab6e", - "metadata": { - "id": "39a0ab6e" - }, + "id": "25ef1be4", + "metadata": {}, "source": [ - "## Step-2: Figure out Runtime Environment\n", + "## Step-1: Figure out Runtime Environment\n", "\n", - "### 2.1 - Determine runtime\n", + "### 1.1 - Determine runtime\n", "\n", "Determine if we are running on Google colab or local python environment" ] @@ -66,14 +54,8 @@ { "cell_type": "code", "execution_count": 1, - "id": "1fe354b7", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1fe354b7", - "outputId": "6665c654-baa5-46dc-d370-9931e0e9eed3" - }, + "id": "13c97768", + "metadata": {}, "outputs": [ { "name": "stdout", @@ -96,74 +78,33 @@ }, { "cell_type": "markdown", - "id": "8e7c104b", - "metadata": { - "id": "8e7c104b" - }, + "id": "df9594f1", + "metadata": {}, "source": [ - "### 2.2 -Download Data if running on Google Colab" + "### 1.2 - Install dependencies if running on Google Colab" ] }, { "cell_type": "code", "execution_count": 2, - "id": "3309799e", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3309799e", - "outputId": "00d7362e-d675-4aaf-8c87-d99027d9a06c" - }, - "outputs": [], - "source": [ - "if RUNNING_IN_COLAB:\n", - " !mkdir -p 'input/solar-system'\n", - " !wget -O 'input/solar-system/earth.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/earth.pdf'\n", - " !wget -O 'input/solar-system/mars.pdf' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/input/solar-system/mars.pdf'\n", - " 
!wget -O 'my_utils.py' 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'" - ] - }, - { - "cell_type": "markdown", - "id": "a5dc2b68", - "metadata": { - "id": "a5dc2b68" - }, - "source": [ - "### 2.3 - Install dependencies if running on Google Colab" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1fcec577", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "1fcec577", - "outputId": "48cf233b-f04e-4b9b-9605-423f87693f10" - }, + "id": "dc538bc3", + "metadata": {}, "outputs": [], "source": [ + "%%capture\n", + "\n", "if RUNNING_IN_COLAB:\n", " ! pip install --default-timeout=100 \\\n", - " data-prep-toolkit==0.2.1 \\\n", - " data-prep-toolkit-transforms==0.2.1 \\\n", - " data-prep-toolkit-transforms-ray==0.2.1 \\\n", - " deepsearch-toolkit" + " data-prep-toolkit-transforms[ray,all]==1.0.0 \\\n", + " humanfriendly" ] }, { "cell_type": "markdown", - "id": "243322b8", - "metadata": { - "id": "243322b8" - }, + "id": "a34c5175", + "metadata": {}, "source": [ - "### 2.4 - Restart Runtime\n", + "### 1.3 - Restart Runtime\n", "\n", "After installing dependencies, be sure restart runtime, so libraries will be loaded\n", "\n", @@ -174,35 +115,17 @@ }, { "cell_type": "markdown", - "id": "e8b10be1", - "metadata": { - "id": "e8b10be1" - }, - "source": [ - "## Step-2: Configuration" - ] - }, - { - "cell_type": "markdown", - "id": "356c66f7", - "metadata": { - "id": "356c66f7" - }, + "id": "113ed1a3", + "metadata": {}, "source": [ - "### 2.1 - Basic Config" + "## Step-2: Configuration & Utils" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "e4YMZrBuFycl", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e4YMZrBuFycl", - "outputId": "1a1d5f01-0856-40b6-8b1c-8187b0c38d64" - }, + "execution_count": 3, + "id": "d4f57ff5", + "metadata": {}, "outputs": [ { "name": "stdout", @@ -223,335 +146,329 @@ " RUNNING_IN_COLAB = False" ] 
}, + { + "cell_type": "markdown", + "id": "970e692b", + "metadata": {}, + "source": [ + "### 2.2 - Setup input/output directories" + ] + }, { "cell_type": "code", - "execution_count": 5, - "id": "33345487", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "33345487", - "outputId": "f3e71a25-4864-4f8f-dfce-4af3d7e08a8a" - }, + "execution_count": 4, + "id": "74ed9531", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "MY_CONFIG.RAY_RUNTIME_WORKERS: 2\n", - "MY_CONFIG.RAY_NUM_CPUS: 0.8\n", - "MY_CONFIG.RAY_MEMORY_GB: 2\n" + "✅ Cleared output directory\n" ] } ], "source": [ - "import os\n", - "\n", - "## Configuration\n", - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig ()\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = 'input/solar-system'\n", - "\n", - "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", - "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", - "\n", - "## Embedding model\n", - "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", + "import os, sys\n", + "import shutil\n", "\n", - "## RAY CONFIGURATION\n", - "### For local runs, we can use more parallelism\n", - "### For google colab, be conservative\n", + "input_dir = \"input\"\n", + "shutil.os.makedirs(input_dir, exist_ok=True)\n", + "output_dir = \"output\"\n", "\n", - "if RUNNING_IN_COLAB:\n", - " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", - " MY_CONFIG.RAY_NUM_CPUS = 0.3\n", - " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", - "else: # local run\n", - " num_cpus_available = os.cpu_count()\n", - " # print (num_cpus_available)\n", + "output_pdf2pq_dir = os.path.join (output_dir, '01_pdf2pq_out')\n", + "output_docid_dir = os.path.join (output_dir, '02_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (output_dir, '03_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (output_dir, '04_fuzzy_dedupe_out')\n", + "output_doc_quality_dir = os.path.join (output_dir, 
'05_doc_quality_out')\n", + "output_final_dir = os.path.join (output_dir, 'output_final')\n", "\n", - " MY_CONFIG.RAY_RUNTIME_WORKERS = 2\n", - " MY_CONFIG.RAY_NUM_CPUS = 0.8\n", - " MY_CONFIG.RAY_MEMORY_GB = 2 # GB\n", - " # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3\n", + "## clear output folder\n", + "shutil.rmtree(output_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_dir, exist_ok=True)\n", + "print (\"✅ Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "3a3bf77f", + "metadata": {}, + "source": [ + "### 2.3 - Runtime Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "991f58d9", + "metadata": {}, + "outputs": [], + "source": [ + "from data_processing.utils import GB\n", "\n", - "print ('MY_CONFIG.RAY_RUNTIME_WORKERS:', MY_CONFIG.RAY_RUNTIME_WORKERS)\n", - "print ('MY_CONFIG.RAY_NUM_CPUS:', MY_CONFIG.RAY_NUM_CPUS)\n", - "print ('MY_CONFIG.RAY_MEMORY_GB:', MY_CONFIG.RAY_MEMORY_GB)\n" + "CONFIG_RAY_NUM_CPUS = 1 # CPUs per worker\n", + "CONFIG_RAY_MEMORY = 2 * GB # memory per worker\n", + "CONFIG_RAY_RUNTIME_WORKERS = 2" + ] + }, + { + "cell_type": "markdown", + "id": "f40af9e1", + "metadata": {}, + "source": [ + "### 2.4 - Handy Utils" ] }, { "cell_type": "code", "execution_count": 6, - "id": "b15e6827", - "metadata": { - "id": "b15e6827" - }, + "id": "df47deb1", + "metadata": {}, "outputs": [], "source": [ - "## Add parent dir to path\n", - "import os,sys\n", + "import os\n", + "import requests\n", + "from humanfriendly import format_size\n", + "import pandas as pd\n", + "import glob\n", + "\n", + "## Reads parquet files in a folder into a pandas dataframe\n", + "def read_parquet_files_as_df (parquet_dir):\n", + " parquet_files = glob.glob(f'{parquet_dir}/*.parquet')\n", + " # read each parquet file into a DataFrame and store in a list\n", + " dfs = [pd.read_parquet (f) for f in parquet_files]\n", + " dfs = [df for df in dfs if not df.empty] # filter out empty dataframes\n", + " # 
Concatenate all DataFrames into a single DataFrame\n", + " if len(dfs) > 0:\n", + " data_df = pd.concat(dfs, ignore_index=True)\n", + " return data_df\n", + " else:\n", + " return pd.DataFrame() # return empty df\n", + "# ------------\n", + "\n", + "\n", + "def download_file(url, local_file, chunk_size=1024*1024):\n", + " \"\"\"\n", + " Downloads a remote URL to a local file.\n", + "\n", + " Args:\n", + " url (str): The remote URL.\n", + " local_file (str): The name of the local file to save the downloaded content.\n", + " chunk_size (int): The size in bytes of each chunk. Defaults to 1024*1024 (1 MB).\n", + "\n", + " Returns:\n", + " None\n", + "\n", + " Example usage:\n", + " download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB\n", + " \"\"\"\n", + " # Check if the local file already exists\n", + " if os.path.exists(local_file):\n", + " file_size = format_size(os.path.getsize(local_file))\n", + " print(f\"Local file '{local_file}' ({file_size}) already exists. Skipping download.\")\n", + " return\n", + "\n", + " # Create the directory if it doesn't exist\n", + " os.makedirs(os.path.dirname(local_file), exist_ok=True)\n", + "\n", + " # Stream the file download\n", + " with requests.get(url, stream=True) as r:\n", + " r.raise_for_status()\n", + " with open(local_file, 'wb') as f:\n", + " for chunk in r.iter_content(chunk_size=chunk_size):\n", + " if chunk: # filter out keep-alive new chunks\n", + " f.write(chunk)\n", + " print()\n", + " file_size = format_size(os.path.getsize(local_file))\n", + " print(f\"{local_file} ({file_size}) downloaded successfully.\")\n", + "## --- end: download_file ------\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "f5be5e73", + "metadata": {}, + "source": [ + "## Step-3: Inspect the Data\n", + "\n", + "We will use simple PDFs. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/pdf-processing-1/input/)\n", "\n", - "this_dir = os.path.abspath('')\n", - "parent_dir = os.path.dirname(this_dir)\n", - "sys.path.append (os.path.abspath (parent_dir))" + "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth-copy.pdf)\n", + "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth2.pdf) nearly identical to earth.pdf (ONE word difference!)\n", + "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/mars.pdf)\n", + "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/spam.pdf) - contains spammy contents\n", + "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" ] }, { "cell_type": "markdown", - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", - "metadata": { - "id": "72510ae6-48b0-4b88-9e13-a623281c3a63" - }, + "id": "b20947ae", + "metadata": {}, "source": [ - "### 2.2 - Setup input/outpur directories" + "### 3.1 - Download Data" ] }, { "cell_type": "code", "execution_count": 7, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "60ac8bee-0960-4309-b225-d7a211b14262", - "outputId": "ec5beb05-027a-49eb-9a96-271471619d81" - }, + "id": "f4cc5e1f", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "✅ Cleared output directory\n" + "Local file 'input/earth.pdf' (58.53 KB) already exists. 
Skipping download.\n", + "Local file 'input/earth-copy.pdf' (58.53 KB) already exists. Skipping download.\n", + "Local file 'input/earth2.pdf' (58.53 KB) already exists. Skipping download.\n", + "Local file 'input/mars.pdf' (57.87 KB) already exists. Skipping download.\n", + "Local file 'input/spam.pdf' (24.87 KB) already exists. Skipping download.\n", + "Local file 'input/lorem-ipsum.pdf' (25.72 KB) already exists. Skipping download.\n" ] } ], "source": [ - "import os, sys\n", - "import shutil\n", "\n", - "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", - " raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", "\n", - "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", - "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", - "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", - "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", - "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", - "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", "\n", - "## clear output folder\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", - "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", "\n", - "print (\"✅ Cleared output directory\")" + 
"download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + "\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + "\n", + "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))" ] }, { "cell_type": "markdown", - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", - "metadata": { - "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb" - }, + "id": "06fef91e", + "metadata": {}, "source": [ - "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", + "## Step-4: Extract Data from PDF (pdf2parquet)\n", "\n", - "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", - "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", - "\n" + "This step we will read PDF files and extract the text data.\n", + "\n", + "[Pdf2Parquet documentation](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/README.md)\n", + "\n", + "We use the [Docling package](https://github.com/DS4SD/docling).\n" ] }, { "cell_type": "markdown", - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", - "metadata": { - "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a" - }, + "id": "b27cc402", + "metadata": {}, "source": [ - "### 3.1 - Set Input/output Folder" + "### 4.1 - Execute" ] }, { "cell_type": "code", "execution_count": 8, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "482605b2-d814-456d-9195-49a2ec454ef0", - "outputId": "f8383739-a4fb-450c-dc37-5df32aab8212" - }, + "id": 
"50f2c6a5", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-1: Processing input='input/solar-system' --> output='output/01_parquet_out'\n" + "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_pdf2pq_out'\n", + "\n" ] - } - ], - "source": [ - "STAGE = 1\n", - "\n", - "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", - "output_folder = output_parquet_dir\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", - "metadata": { - "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b" - }, - "source": [ - "### 3.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", - "outputId": "14a36e73-a186-4431-a755-f46ccb691130" - }, - "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "13:30:44 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", - "13:30:44 INFO - pipeline id pipeline_id\n", - "13:30:44 INFO - code location None\n", - "13:30:44 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1}\n", - "13:30:44 INFO - actor creation delay 0\n", - "13:30:44 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:30:44 INFO - data factory data_ is using local data access: input_folder - input/solar-system output_folder - output/01_parquet_out\n", - "13:30:44 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:44 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint 
['.parquet']\n", - "13:30:44 INFO - Running locally\n", - "2024-10-18 13:30:47,436\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - orchestrator started at 2024-10-18 13:30:50\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of files is 2, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.0551910400390625, 'total_file_size': 0.11101436614990234}\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.872821807861328, 'object_store': 7.436410903930664}\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:50 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m 13:30:53 INFO - Initializing models\n", - "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 110376.42it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=10098)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - Completed processing 2 files in 0.145 min\n", - "\u001b[36m(orchestrate pid=9266)\u001b[0m 13:30:59 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m 13:30:53 INFO - Initializing models\n", - "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 73713.60it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=10099)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. 
Note: This module is much faster with a GPU.\n", - "13:31:09 INFO - Completed execution in 0.421 min, execution result 0\n" + "11:30:38 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "11:30:38 INFO - pipeline id pipeline_id\n", + "11:30:38 INFO - code location None\n", + "11:30:38 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "11:30:38 INFO - actor creation delay 0\n", + "11:30:38 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "11:30:38 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_pdf2pq_out\n", + "11:30:38 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:30:38 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "11:30:38 INFO - Running locally\n", + "2025-01-29 11:30:39,945\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:43 INFO - orchestrator started at 2025-01-29 11:30:43\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:43 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:43 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.069149781018496, 'object_store': 4.534574889577925}\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:43 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=3068041)\u001b[0m 11:30:47 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 35378.38it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=3068041)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(RayTransformFileProcessor pid=3068042)\u001b[0m 11:30:47 INFO - Initializing models\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:58 INFO - Completed 1 files in 0.032 min\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:58 INFO - Completed 2 files in 0.033 min\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 183246.29it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=3068042)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:00 INFO - Completed 3 files in 0.063 min\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:00 INFO - Completed 4 files in 0.063 min\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:00 INFO - Completed 4 files (66.667%) in 0.063 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:02 INFO - Completed processing 6 files in 0.09 min\n", + "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:02 INFO - done flushing in 0.001 sec\n", + "11:31:12 INFO - Completed execution in 0.56 min, execution result 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:1 completed successfully\n", - "CPU times: user 4.41 s, sys: 1.39 s, total: 5.8 s\n", - "Wall time: 31.1 s\n" + "✅ Stage:1 completed successfully\n" ] } ], "source": [ - "%%time\n", - "\n", - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from pdf2parquet_transform import (\n", - " pdf2parquet_contents_type_cli_param,\n", - " pdf2parquet_contents_types,\n", - ")\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", - "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", - "\n", - "from data_processing.utils import GB, ParamsUtils\n", - "\n", - "\n", - "# create parameters\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS, \"memory\": MY_CONFIG.RAY_MEMORY_GB * GB}\n", - "ingest_config = {\n", - " pdf2parquet_contents_type_cli_param: pdf2parquet_contents_types.JSON,\n", - "}\n", - "\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " \"data_files_to_use\": ast.literal_eval(\"['.pdf']\"),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": 1, # so model download to cleanup works properly\n", - "\n", - "}\n", - "\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", - "# create launcher\n", - "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", - "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", + "from dpk_pdf2parquet.ray.transform import Pdf2Parquet\n", + "from dpk_pdf2parquet.transform import pdf2parquet_contents_types\n", "\n", - "if return_code == 0:\n", + "STAGE = 1\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_dir}' --> output='{output_pdf2pq_dir}'\\n\", flush=True)\n", + "\n", + "\n", + "result = Pdf2Parquet(input_folder= input_dir,\n", + " output_folder= output_pdf2pq_dir,\n", + " data_files_to_use=['.pdf'],\n", + " pdf2parquet_contents_type=pdf2parquet_contents_types.MARKDOWN, # markdown\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " num_cpus= CONFIG_RAY_NUM_CPUS,\n", + " memory= CONFIG_RAY_MEMORY,\n", + " runtime_num_workers = CONFIG_RAY_RUNTIME_WORKERS,\n", + " ).transform()\n", + "\n", + "if result == 0:\n", " print (f\"✅ Stage:{STAGE} completed successfully\")\n", "else:\n", - " raise Exception (\"❌ Ray job failed\")\n" + " raise Exception (f\"❌ Stage:{STAGE} failed\")" ] }, { "cell_type": "markdown", - "id": "5ca790e0", - "metadata": { - "id": "5ca790e0" - }, + "id": "159a5d67", + "metadata": {}, "source": [ - "### 3.3 - Inspect Generated output\n", + "### 4.2 - Inspect Generated output\n", "\n", "Here we should see one entry per input file processed." 
] }, { "cell_type": "code", - "execution_count": 10, - "id": "fe59563d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 - }, - "id": "fe59563d", - "outputId": "d10c022d-524f-4a13-ebf8-6431114e9172" - }, + "execution_count": 9, + "id": "82f04cd9", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Output dimensions (rows x columns)= (2, 12)\n" + "Displaying contents of : output/01_pdf2pq_out\n" ] }, { @@ -581,6 +498,7 @@ " num_tables\n", " num_doc_elements\n", " document_id\n", + " document_hash\n", " ext\n", " hash\n", " size\n", @@ -592,32 +510,98 @@ " \n", " \n", " 0\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", + " 1\n", + " 0\n", + " 2\n", + " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " 6571294142213095721\n", + " pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " 2025-01-29T11:31:00.543154\n", + " 1.836573\n", + " lorem-ipsum.pdf\n", + " \n", + " \n", + " 1\n", + " spam.pdf\n", + " Free xxx\n", + " 1\n", + " 0\n", + " 2\n", + " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " 10026122586747302274\n", + " pdf\n", + " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", + " 8\n", + " 2025-01-29T11:31:02.125197\n", + " 1.579146\n", + " spam.pdf\n", + " \n", + " \n", + " 2\n", + " earth2.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", + " 1\n", + " 0\n", + " 11\n", + " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 10729312978404042321\n", + " pdf\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", + " 610\n", + " 2025-01-29T11:31:00.528443\n", + " 1.846497\n", + " earth2.pdf\n", + " \n", + " \n", + " 3\n", " mars.pdf\n", - " {\"_name\":\"\",\"type\":\"pdf-document\",\"description...\n", + " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", + " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 7758129997476962679\n", " pdf\n", - " 
8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 717\n", + " 2025-01-29T11:31:02.115064\n", + " 1.583783\n", " mars.pdf\n", " \n", " \n", - " 1\n", + " 4\n", + " earth-copy.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", + " 1\n", + " 0\n", + " 11\n", + " da835156-76ca-435f-bc0f-4fb1fca46097\n", + " 14711865278795535908\n", + " pdf\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " 2025-01-29T11:30:58.702357\n", + " 1.957911\n", + " earth-copy.pdf\n", + " \n", + " \n", + " 5\n", " earth.pdf\n", - " {\"_name\":\"\",\"type\":\"pdf-document\",\"description...\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", + " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 14711865278795535908\n", " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " 2025-01-29T11:30:58.677819\n", + " 1.933069\n", " earth.pdf\n", " \n", " \n", @@ -625,36 +609,57 @@ "" ], "text/plain": [ - " filename contents num_pages \\\n", - "0 mars.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", - "1 earth.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 1 \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "5 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "5 1 0 11 \n", "\n", - " num_tables num_doc_elements document_id ext \\\n", - "0 0 11 62e5639f-f922-4ccc-a041-3cb02f1cfd83 pdf \n", - "1 0 11 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 pdf \n", + " document_id document_hash ext \\\n", + "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", + "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", + "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", + "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", + "4 da835156-76ca-435f-bc0f-4fb1fca46097 14711865278795535908 pdf \n", + "5 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", "\n", " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", - " date_acquired pdf_convert_time source_filename \n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.494027 2.015123 earth.pdf " + " date_acquired pdf_convert_time source_filename \n", + "0 2025-01-29T11:31:00.543154 1.836573 lorem-ipsum.pdf \n", + "1 2025-01-29T11:31:02.125197 1.579146 spam.pdf \n", + "2 2025-01-29T11:31:00.528443 1.846497 earth2.pdf \n", + "3 2025-01-29T11:31:02.115064 1.583783 mars.pdf \n", + "4 2025-01-29T11:30:58.702357 1.957911 earth-copy.pdf \n", + "5 2025-01-29T11:30:58.677819 1.933069 earth.pdf " ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", - "\n", - "output_df.head(5)\n", + "print (\"Displaying contents of : \", output_pdf2pq_dir)\n", + "output_df = read_parquet_files_as_df(output_pdf2pq_dir)\n", + "# print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "output_df.head(10)\n", "\n", "## To display certain columns\n", "#parquet_df[['column1', 'column2', 'column3']].head(5)" @@ -662,511 +667,223 @@ }, { "cell_type": "markdown", - "id": "e5058a21", - "metadata": { - "id": "e5058a21" - }, + "id": "56232298", + "metadata": {}, "source": [ "\n", - "### 3.4 - Understand the output\n", + "### 4.3 - Understand the output\n", "\n", "Here are some interesting attributes to note:\n", "\n", "- **filename** : original filename\n", "- **contents** : text\n", "- **document_id**: unique id (UUID) assignd to this document\n", - "- **hash** : hash of document\n", + "- **document_hash**: hash of documents\n", + "- **hash** : hash of `contents` column\n", "- **pdf_convert_time** : time to convert this pdf in seconds\n", "\n", - "Let's inspect the **contents** column. See how the text is being divided up!" 
+ "**Note: you should notice the hash values are identical for the duplicate documents**\n", + "\n", + "Let's inspect the **contents** column." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4bcc03dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Earth\n", + "\n", + "## Solar System\n", + "\n", + "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", + "\n", + "For more details about our Solar system see Chapter 1.\n", + "\n", + "## Earth\n", + "\n", + "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", + "\n", + "Basic facts about Earth:\n", + "\n", + "- · Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", + "- · Moons: One moon, called Luna or simply \"the Moon\".\n", + "- · Rotation Period: 24 hours (one day)\n" + ] + } + ], + "source": [ + "print (output_df[output_df['filename'] == 'earth.pdf'].iloc[0,]['contents'])" ] }, { "cell_type": "code", "execution_count": 11, - "id": "f870e624", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f870e624", - "outputId": "9142246b-988c-4674-99d7-e2f3fffbaaf4" - }, + "id": "9d07a30e", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '1a83f43f3a202e3f203c1263e36961ecc45d401aad488f638fc5559a584333b2',\n", - " 'filename': 'mars.pdf',\n", - " 'page-hashes': [{'hash': '551fe7a9bde2a9302f150c0a79a13fcc0868fcf73ac6afb80be645c1174734a0',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': 
[{'bbox': [133.35137939,\n", - " 654.45184326,\n", - " 169.88169861,\n", - " 667.98492432],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.09541321,\n", - " 630.68127441,\n", - " 210.66503906,\n", - " 642.34405518],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.84518433,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.02520752],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.18510437,\n", - " 570.83258057,\n", - " 374.99838257,\n", - " 581.07043457],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about the Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.22866821,\n", - " 542.98168945,\n", - " 163.86282349,\n", - " 554.45288086],\n", - " 'page': 1,\n", - " 'span': [0, 4]}],\n", - " 'text': 'Mars',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87440491,\n", - " 500.84011841,\n", - " 477.48345947,\n", - " 534.55810547],\n", - " 'page': 1,\n", - " 'span': [0, 196]}],\n", - " 'text': 'Mars, the fourth planet from the Sun, is a cold, '\n", - " 'desert world with a thin atmosphere composed '\n", - " 'primarily of carbon dioxide. 
Its reddish hue comes '\n", - " 'from iron oxide, or rust, prevalent on its surface.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.2026062,\n", - " 482.90710449,\n", - " 237.04431152,\n", - " 493.07443237],\n", - " 'page': 1,\n", - " 'span': [0, 23]}],\n", - " 'text': 'Basic facts about Mars:',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 453.019104,\n", - " 477.48171997,\n", - " 474.9703064],\n", - " 'page': 1,\n", - " 'span': [0, 78]}],\n", - " 'text': '· Distance from the Sun: Average of 228 million '\n", - " 'kilometers (142 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.79351807,\n", - " 431.73287964,\n", - " 451.2142334],\n", - " 'page': 1,\n", - " 'span': [0, 64]}],\n", - " 'text': '· Rotation Period: 24.6 hours (one Martian day - '\n", - " 'called a \"sol\")',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 429.10913086,\n", - " 365.9559021,\n", - " 438.83737183],\n", - " 'page': 1,\n", - " 'span': [0, 44]}],\n", - " 'text': '· Moons: Two small moons, Phobos and Deimos.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.51646423],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" + "Free xxx\n" ] } ], "source": [ - "import pprint\n", - "import json\n", - "\n", - "pprint.pprint (json.loads(output_df.iloc[0, ]['contents']))\n", - "# json.loads(output_df.iloc[0, ]['contents'])" + "print (output_df[output_df['filename'] == 'spam.pdf'].iloc[0,]['contents'])\n" ] 
}, { "cell_type": "code", "execution_count": 12, - "id": "e1a10c2d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e1a10c2d", - "outputId": "ca74113e-6fd3-488b-836a-60bd58299fb1" - }, + "id": "866857df", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'_name': '',\n", - " 'description': {'logs': []},\n", - " 'equations': [],\n", - " 'figures': [],\n", - " 'file-info': {'#-pages': 1,\n", - " 'document-hash': '7401ae81637dbb89e7040dcd5945bbfb75ff8648bb761c69f8a1595e86538748',\n", - " 'filename': 'earth.pdf',\n", - " 'page-hashes': [{'hash': 'ca802e4bd5a3301792808caea2a47db51f0520888875b77fc230c99ee851c19b',\n", - " 'model': 'default',\n", - " 'page': 1}]},\n", - " 'footnotes': [],\n", - " 'main-text': [{'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.30961609,\n", - " 654.45184326,\n", - " 174.04208374,\n", - " 667.93347168],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.12528992,\n", - " 630.69073486,\n", - " 210.66503906,\n", - " 642.27935791],\n", - " 'page': 1,\n", - " 'span': [0, 12]}],\n", - " 'text': 'Solar System',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.87112427,\n", - " 588.96014404,\n", - " 479.40917969,\n", - " 623.04595947],\n", - " 'page': 1,\n", - " 'span': [0, 205]}],\n", - " 'text': 'Our solar system is a vast and fascinating expanse, '\n", - " 'comprising eight planets, five dwarf planets, '\n", - " 'numerous moons, asteroids, comets, and other '\n", - " 'celestial bodies. 
At its center lies the star we call '\n", - " 'the Sun.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.20942688,\n", - " 570.81555176,\n", - " 375.57919312,\n", - " 581.08459473],\n", - " 'page': 1,\n", - " 'span': [0, 54]}],\n", - " 'text': 'For more details about our Solar system see Chapter '\n", - " '1.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Section-header',\n", - " 'prov': [{'bbox': [133.15542603,\n", - " 542.98168945,\n", - " 167.32983398,\n", - " 554.36669922],\n", - " 'page': 1,\n", - " 'span': [0, 5]}],\n", - " 'text': 'Earth',\n", - " 'type': 'subtitle-level-1'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [132.91053772,\n", - " 512.46295166,\n", - " 477.84887695,\n", - " 534.48431396],\n", - " 'page': 1,\n", - " 'span': [0, 107]}],\n", - " 'text': \"Earth is the third planet from the Sun. It's our home \"\n", - " 'planet. Earth is the only place we know of with life.',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Text',\n", - " 'prov': [{'bbox': [133.30151367,\n", - " 494.86206055,\n", - " 240.17156982,\n", - " 505.07229614],\n", - " 'page': 1,\n", - " 'span': [0, 24]}],\n", - " 'text': 'Basic facts about Earth:',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 464.97409058,\n", - " 477.47979736,\n", - " 487.02810669],\n", - " 'page': 1,\n", - " 'span': [0, 79]}],\n", - " 'text': '· Distance from the Sun: Average of 149.6 million '\n", - " 'kilometers (93 million miles)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 452.86901855,\n", - " 317.90722656,\n", - " 463.24041748],\n", - " 'page': 1,\n", - " 'span': [0, 37]}],\n", - " 'text': '· Rotation Period: 24 hours (one day)',\n", - " 'type': 'paragraph'},\n", - " {'name': 'List-item',\n", - " 'prov': [{'bbox': [145.94500732,\n", - " 440.71496582,\n", - " 396.66357422,\n", - " 451.19915771],\n", - " 'page': 1,\n", - " 
'span': [0, 52]}],\n", - " 'text': '· Moons: One moon, called Luna or simply \"the Moon\".',\n", - " 'type': 'paragraph'},\n", - " {'name': 'Page-footer',\n", - " 'prov': [{'bbox': [303.13299561,\n", - " 87.20314026,\n", - " 308.11428833,\n", - " 96.53633118],\n", - " 'page': 1,\n", - " 'span': [0, 1]}],\n", - " 'text': '1',\n", - " 'type': 'page-footer'}],\n", - " 'page-dimensions': [{'height': 792.0, 'page': 1, 'width': 612.0}],\n", - " 'page-footers': [],\n", - " 'page-headers': [],\n", - " 'tables': [],\n", - " 'type': 'pdf-document'}\n" + "Lorem ipsum Lorem ipsum Lorem ipsum\n" ] } ], "source": [ - "pprint.pprint (json.loads(output_df.iloc[1, ]['contents']))" + "print (output_df[output_df['filename'] == 'lorem-ipsum.pdf'].iloc[0,]['contents'])" ] }, { "cell_type": "markdown", - "id": "72274586", - "metadata": { - "id": "72274586" - }, + "id": "270f1673", + "metadata": {}, "source": [ - "## Step-4: Doc chunks\n", + "## Step-5: Create DOC ID for Documents\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", "\n", - "In the previous step, we have extracted text from oru PDFs. But we have the content of entire file as 'one row' in our parquet output.\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. 
To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", "\n", - "In this step, we are going to split the documents in chunks, according to their layout segmentation.\n", + "**This step is a pre-requisite for fuzzy dedup** in the pipeline.\n", "\n", - "This transform uses [Quackling](https://github.com/DS4SD/quackling) `HierarchicalChunker`\n", - "to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.\n", - "It relies on documents converted with the Docling library in the [pdf2parquet transform](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) using the option `contents_type: \"application/json\"`,\n", - "which provides the required JSON structure." + "[DocID documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/doc_id)" ] }, { "cell_type": "markdown", - "id": "96198fa6", - "metadata": { - "id": "96198fa6" - }, + "id": "32478bb0", + "metadata": {}, "source": [ - "### 4.1 - Set Input/output Folder" + "### 5.1 - Execute" ] }, { "cell_type": "code", "execution_count": 13, - "id": "305f00a3", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "305f00a3", - "outputId": "689f1531-7007-49d9-9a27-39c39f8f2c50" - }, + "id": "9b0f613b", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" + "🏃🏼 STAGE-2: Processing input='output/01_pdf2pq_out' --> output='output/02_docid_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "11:31:13 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "11:31:13 INFO - pipeline id pipeline_id\n", + "11:31:13 INFO - code location None\n", + "11:31:13 INFO - 
number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "11:31:13 INFO - actor creation delay 0\n", + "11:31:13 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "11:31:13 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "11:31:13 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:31:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:31:13 INFO - Running locally\n", + "2025-01-29 11:31:14,619\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:15 INFO - orchestrator started at 2025-01-29 11:31:15\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:15 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:15 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.08399505726993, 'object_store': 4.541997527703643}\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:15 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 4 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - 
Completed 4 files (66.667%) in 0.003 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - done flushing in 0.001 sec\n", + "11:31:27 INFO - Completed execution in 0.224 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:2 completed successfully\n", + "CPU times: user 124 ms, sys: 162 ms, total: 285 ms\n", + "Wall time: 14.9 s\n" ] } ], "source": [ - "STAGE = 2\n", - "\n", - "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_chunk_dir\n", + "%%time\n", "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "from dpk_doc_id.ray.transform import DocID\n", "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "369f2cd1", - "metadata": { - "id": "369f2cd1" - }, - "source": [ - "### 4.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "5b7b18d5", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5b7b18d5", - "outputId": "0146bd91-2ccb-4e56-c649-f415a38bfcf8" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:31:12 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", - "13:31:12 INFO - pipeline id pipeline_id\n", - "13:31:12 INFO - code location None\n", - "13:31:12 INFO - number of workers 2 worker options {'num_cpus': 0.8, 
'max_restarts': -1}\n", - "13:31:12 INFO - actor creation delay 0\n", - "13:31:12 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:12 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", - "13:31:12 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:12 INFO - Running locally\n", - "2024-10-18 13:31:14,121\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - orchestrator started at 2024-10-18 13:31:16\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of files is 2, source profile {'max_file_size': 0.02239513397216797, 'min_file_size': 0.02167987823486328, 'total_file_size': 0.04407501220703125}\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.963891602121294, 'object_store': 7.4819458005949855}\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:16 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - Completed processing 2 files in 0.032 min\n", - "\u001b[36m(orchestrate pid=10912)\u001b[0m 13:31:18 INFO - done flushing in 0.001 sec\n", - "13:31:28 INFO - Completed execution in 0.269 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:2 completed successfully\n", - "CPU times: user 982 ms, sys: 291 ms, total: 1.27 s\n", - "Wall time: 18.9 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from doc_chunk_transform_ray import DocChunkRayTransformConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # doc_chunk arguments\n", - " # ...\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(DocChunkRayTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", + "STAGE = 2\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_pdf2pq_dir}' --> output='{output_docid_dir}'\\n\", flush=True)\n", + "\n", + "result = DocID(input_folder= output_pdf2pq_dir,\n", + " output_folder= output_docid_dir,\n", + " doc_id_doc_column= \"contents\",\n", + " doc_id_hash_column= \"doc_hash\",\n", + " # doc_id_int_column= \"doc_id_int\",\n", + " doc_id_int_column= \"int_id_column\",\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " num_cpus= CONFIG_RAY_NUM_CPUS,\n", + " memory= CONFIG_RAY_MEMORY,\n", + " runtime_num_workers = CONFIG_RAY_RUNTIME_WORKERS,\n", + " ).transform()\n", + " \n", + "if result == 0:\n", " print (f\"✅ Stage:{STAGE} completed successfully\")\n", "else:\n", - " raise Exception (\"❌ Ray job failed\")" + " raise Exception (f\"❌ Stage:{STAGE} failed\")" ] }, { "cell_type": "markdown", - "id": "213afdf6", - "metadata": { - "id": "213afdf6" - }, + "id": "af2de0e5", + "metadata": {}, "source": [ - "### 4.3 - Inspect Generated output\n", + "### 5.2 - Inspect Generated output\n", "\n", - "We would see documents are split into many chunks" + "You would see two new columns: **doc_hash** and **int_id_column**" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "d8138d43", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 897 - }, - "id": "d8138d43", - 
"outputId": "e1758b0c-5f22-4368-c3e6-ff778fc9ae82" - }, + "execution_count": 14, + "id": "38b6e1cc", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Files processed : 2\n", - "Chunks created : 8\n", - "Input data dimensions (rows x columns)= (2, 12)\n", - "Output data dimensions (rows x columns)= (8, 16)\n" + "Displaying contents of : output/02_docid_out\n" ] }, { @@ -1191,300 +908,324 @@ " \n", " \n", " filename\n", + " contents\n", " num_pages\n", " num_tables\n", " num_doc_elements\n", + " document_id\n", + " document_hash\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", " source_filename\n", - " source_document_id\n", - " contents\n", - " doc_jsonpath\n", - " page_number\n", - " bbox\n", - " document_id\n", + " doc_hash\n", + " int_id_column\n", " \n", " \n", " \n", " \n", " 0\n", - " mars.pdf\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", " 1\n", " 0\n", - " 11\n", + " 2\n", + " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " 6571294142213095721\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Solar System\\nOur solar system is a vast and f...\n", - " $.main-text[2]\n", - " 1\n", - " [132.84518433, 588.96014404, 479.40917969, 623...\n", - " 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " 2025-01-29T11:31:00.543154\n", + " 1.836573\n", + " lorem-ipsum.pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 3\n", " \n", " \n", " 1\n", - " mars.pdf\n", + " spam.pdf\n", + " Free xxx\n", " 1\n", " 0\n", - " 11\n", + " 2\n", + " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " 10026122586747302274\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " 
mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Solar System\\nFor more details about the Solar...\n", - " $.main-text[3]\n", - " 1\n", - " [133.18510437, 570.83258057, 374.99838257, 581...\n", - " dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...\n", + " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", + " 8\n", + " 2025-01-29T11:31:02.125197\n", + " 1.579146\n", + " spam.pdf\n", + " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", + " 5\n", " \n", " \n", " 2\n", - " mars.pdf\n", + " earth2.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", + " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 10729312978404042321\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Mars\\nMars, the fourth planet from the Sun, is...\n", - " $.main-text[5]\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", + " 610\n", + " 2025-01-29T11:31:00.528443\n", + " 1.846497\n", + " earth2.pdf\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 1\n", - " [132.87440491, 500.84011841, 477.48345947, 534...\n", - " a31663e06fac41470ecc459f5a58658a3f9997d7801053...\n", " \n", " \n", " 3\n", " mars.pdf\n", + " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", + " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 7758129997476962679\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 717\n", + " 2025-01-29T11:31:02.115064\n", + " 1.583783\n", " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Basic facts about Mars:\\n· Distance from the S...\n", - " $.main-text[6]\n", - " 1\n", - " [133.2026062, 482.90710449, 237.04431152, 493....\n", - " 
7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 4\n", " \n", " \n", " 4\n", - " earth.pdf\n", - " 1\n", - " 0\n", - " 11\n", - " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Solar System\\nOur solar system is a vast and f...\n", - " $.main-text[2]\n", - " 1\n", - " [132.87112427, 588.96014404, 479.40917969, 623...\n", - " 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...\n", - " \n", - " \n", - " 5\n", - " earth.pdf\n", + " earth-copy.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", + " da835156-76ca-435f-bc0f-4fb1fca46097\n", + " 14711865278795535908\n", " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Solar System\\nFor more details about our Solar...\n", - " $.main-text[3]\n", - " 1\n", - " [133.20942688, 570.81555176, 375.57919312, 581...\n", - " d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...\n", - " \n", - " \n", - " 6\n", - " earth.pdf\n", - " 1\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " 2025-01-29T11:30:58.702357\n", + " 1.957911\n", + " earth-copy.pdf\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", - " 11\n", - " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Earth\\nEarth is the third planet from the Sun....\n", - " $.main-text[5]\n", - " 1\n", - " [132.91053772, 512.46295166, 477.84887695, 534...\n", - " 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...\n", " \n", " \n", - " 7\n", + " 5\n", " earth.pdf\n", + " ## Earth\\n\\n## Solar 
System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", + " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 14711865278795535908\n", " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " 2025-01-29T11:30:58.677819\n", + " 1.933069\n", " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Earth\\nBasic facts about Earth:\\n· Distance fr...\n", - " $.main-text[6]\n", - " 1\n", - " [133.30151367, 494.86206055, 240.17156982, 505...\n", - " 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 2\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", + "5 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "5 1 0 11 \n", "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + " document_id document_hash ext \\\n", + "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", + "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", + "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", + "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", + "4 da835156-76ca-435f-bc0f-4fb1fca46097 14711865278795535908 pdf \n", + "5 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\n· Distance from the S... 
$.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-29T11:31:00.543154 1.836573 lorem-ipsum.pdf \n", + "1 2025-01-29T11:31:02.125197 1.579146 spam.pdf \n", + "2 2025-01-29T11:31:00.528443 1.846497 earth2.pdf \n", + "3 2025-01-29T11:31:02.115064 1.583783 mars.pdf \n", + "4 2025-01-29T11:30:58.702357 1.957911 earth-copy.pdf \n", + "5 2025-01-29T11:30:58.677819 1.933069 earth.pdf \n", "\n", - " document_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 
\n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... " + " doc_hash int_id_column \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 2 " ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", + "print (\"Displaying contents of : \", output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "141f7cf1", + "metadata": {}, + "source": [ + "## Step-6: Eliminate Duplicate Documents\n", "\n", - "print (f\"Files processed : {input_df.shape[0]:,}\")\n", - "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", + "We have 2 exact duplicates: **earth.pdf** , **earth-copy.pdf**\n", "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "Note how **doc_hash** for these documents are the same.\n", "\n", - "output_df.head(10)" + "[Exact dedupe information](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/ededup)" ] }, { "cell_type": "markdown", - "id": "9e9ca75c", - "metadata": { - "id": "9e9ca75c" - }, + "id": "eb74af84", + "metadata": {}, "source": [ - "### 4.4 - Understanding the Output\n", - "\n", - "Here we see 2 PDF files are split into 6 chunks. 
Basically we see the documents are being split along 'natural boundaris' - paragraphs and bullet points\n", + "### 6.1 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "48beaa13", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-3: Processing input='output/02_docid_out' --> output='output/03_exact_dedupe_out'\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "11:31:28 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "11:31:28 INFO - pipeline id pipeline_id\n", + "11:31:28 INFO - code location None\n", + "11:31:28 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "11:31:28 INFO - actor creation delay 0\n", + "11:31:28 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "11:31:28 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "11:31:28 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:31:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:31:28 INFO - Running locally\n", + "2025-01-29 11:31:29,530\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:30 INFO - orchestrator started at 2025-01-29 11:31:30\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:30 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:30 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.072644806466997, 'object_store': 4.536322402767837}\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:30 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 4 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 4 files (66.667%) in 0.003 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:32 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:32 INFO - done flushing in 0.001 sec\n", + "11:31:42 INFO - Completed execution in 0.223 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:3 completed successfully\n", + "CPU times: user 126 ms, sys: 128 ms, total: 254 ms\n", + "Wall time: 14.6 s\n" + ] + } + ], + "source": [ + "%%time\n", "\n", - "See how **document_id** is carried throughout. 
This helps us identify original documents.\n", + "from dpk_ededup.ray.transform import Ededup\n", + "\n", + "STAGE = 3\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_docid_dir}' --> output='{output_exact_dedupe_dir}'\\n\", flush=True)\n", + "\n", + "result = Ededup(input_folder=output_docid_dir,\n", + " output_folder=output_exact_dedupe_dir,\n", + " ededup_doc_column=\"contents\",\n", + " ededup_doc_id_column=\"doc_hash\",\n", + " ededup_num_hashes= 2,\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " num_cpus= CONFIG_RAY_NUM_CPUS,\n", + " memory= CONFIG_RAY_MEMORY,\n", + " runtime_num_workers = CONFIG_RAY_RUNTIME_WORKERS,\n", + " ).transform()\n", + "\n", + "if result == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (f\"❌ Stage:{STAGE} failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "d9d93e16", + "metadata": {}, + "source": [ + "### 6.2 - Inspect Generated output\n", "\n", - "Also note **contents** is now plain text (not JSON as before)" + "You can see one of **earth.pdf** or **earth-copy.pdf** will be eliminated." 
] }, { "cell_type": "code", "execution_count": 16, - "id": "3090c950", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - }, - "id": "3090c950", - "outputId": "3f542446-2cfa-404c-c642-3732f7b74568" - }, + "id": "ef98911d", + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input files before exact dedupe : 6\n", + "Output files after exact dedupe : 5\n", + "Duplicate files removed : 1\n", + "Displaying contents of : output/03_exact_dedupe_out\n" + ] + }, { "data": { "text/html": [ @@ -1508,63 +1249,164 @@ " \n", " filename\n", " contents\n", + " num_pages\n", + " num_tables\n", + " num_doc_elements\n", + " document_id\n", + " document_hash\n", + " ext\n", + " hash\n", + " size\n", + " date_acquired\n", + " pdf_convert_time\n", + " source_filename\n", + " doc_hash\n", + " int_id_column\n", + " removed\n", " \n", " \n", " \n", " \n", " 0\n", - " mars.pdf\n", - " Solar System\\nOur solar system is a vast and f...\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", + " 1\n", + " 0\n", + " 2\n", + " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " 6571294142213095721\n", + " pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " 2025-01-29T11:31:00.543154\n", + " 1.836573\n", + " lorem-ipsum.pdf\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 3\n", + " []\n", " \n", " \n", " 1\n", - " mars.pdf\n", - " Solar System\\nFor more details about the Solar...\n", + " spam.pdf\n", + " Free xxx\n", + " 1\n", + " 0\n", + " 2\n", + " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " 10026122586747302274\n", + " pdf\n", + " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", + " 8\n", + " 2025-01-29T11:31:02.125197\n", + " 1.579146\n", + " spam.pdf\n", + " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", + " 5\n", + " []\n", " \n", " \n", " 2\n", - " mars.pdf\n", - " Mars\\nMars, the fourth planet from the Sun, is...\n", + " earth2.pdf\n", 
+ " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", + " 1\n", + " 0\n", + " 11\n", + " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 10729312978404042321\n", + " pdf\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", + " 610\n", + " 2025-01-29T11:31:00.528443\n", + " 1.846497\n", + " earth2.pdf\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", + " 1\n", + " []\n", " \n", " \n", " 3\n", " mars.pdf\n", - " Basic facts about Mars:\\n· Distance from the S...\n", + " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", + " 1\n", + " 0\n", + " 11\n", + " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 7758129997476962679\n", + " pdf\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 717\n", + " 2025-01-29T11:31:02.115064\n", + " 1.583783\n", + " mars.pdf\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 4\n", + " []\n", " \n", " \n", " 4\n", " earth.pdf\n", - " Solar System\\nOur solar system is a vast and f...\n", - " \n", - " \n", - " 5\n", - " earth.pdf\n", - " Solar System\\nFor more details about our Solar...\n", - " \n", - " \n", - " 6\n", - " earth.pdf\n", - " Earth\\nEarth is the third planet from the Sun....\n", - " \n", - " \n", - " 7\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", + " 1\n", + " 0\n", + " 11\n", + " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 14711865278795535908\n", + " pdf\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " 2025-01-29T11:30:58.677819\n", + " 1.933069\n", " earth.pdf\n", - " Earth\\nBasic facts about Earth:\\n· Distance fr...\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 2\n", + " []\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Solar System\\nFor more details about the Solar...\n", - "2 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "3 mars.pdf Basic facts about Mars:\\n· 
Distance from the S...\n", - "4 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "5 earth.pdf Solar System\\nFor more details about our Solar...\n", - "6 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "7 earth.pdf Earth\\nBasic facts about Earth:\\n· Distance fr..." + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", + "\n", + " document_id document_hash ext \\\n", + "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", + "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", + "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", + "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", + "4 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", + "\n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-29T11:31:00.543154 1.836573 lorem-ipsum.pdf \n", + "1 2025-01-29T11:31:02.125197 1.579146 spam.pdf \n", + "2 2025-01-29T11:31:00.528443 1.846497 earth2.pdf \n", + "3 2025-01-29T11:31:02.115064 1.583783 mars.pdf \n", + "4 2025-01-29T11:30:58.677819 1.933069 earth.pdf \n", + "\n", + " doc_hash int_id_column removed \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 2 [] " ] }, "execution_count": 16, @@ -1573,268 +1415,261 @@ } ], "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d5f151ae", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "d5f151ae", - "outputId": "4616d648-0852-4ecb-cef8-f5940e176de0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 3------\n", - "Basic facts about Mars:\n", - "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "· Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "· Rotation Period: 24 hours (one day)\n", - "· Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" + "input_df = read_parquet_files_as_df(output_docid_dir)\n", + "output_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "print (\"Displaying contents of : \", output_exact_dedupe_dir)\n", + "output_df.head(10)" ] }, { "cell_type": "markdown", - "id": "20217298", - "metadata": { - "id": "20217298" - }, + "id": "1cedeca2", + "metadata": {}, "source": [ - "## Step-5: DOC ID generation\n", + "## Step-7: Fuzzy Dedupe\n", "\n", - "This transform annotates documents with document \"ids\". 
It supports the following transformations of the original data:\n", + "In the previous step, we removed **exact duplicates (identical documents)**.\n", "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set **hash_column** to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set **int_id_column** to the name of the column, where you want to store it.\n", + "Fuzzy de-dupe can further filter out documents that are **not exactly identical, but nearly identical**\n", + "\n", + "Here is a simple example:\n", + "\n", + "`Our solar system is a vast and fascinating expanse`\n", + "\n", + "`The solar system is a vast and fascinating expanse`\n", + "\n", + "Only one word is different `Our` vs `The`.\n", + "\n", + "Imagine two documents with one extra blank line. For our purposes they are the same.\n", "\n", - "**This is a pre-requisite for fuzzy dedup** in the pipeline." + "[Fuzzy dedupe documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/fdedup)\n", + "\n", + "### Tweaking fuzzy matches\n", + "\n", + "**`jaccard_similarity_threshold`** is the parameter used to tweak similarities between documents. Its value is between 0 and 1.0. Values close to 1.0 mean more strict checking (fewer documents will qualify). 
A lower threshold means more lenient matches (more documents will qualify)\n", + "\n", + "Adjust this value to find what works for your documents" ] }, { "cell_type": "markdown", - "id": "66811f5b", - "metadata": { - "id": "66811f5b" - }, + "id": "3f21d132", + "metadata": {}, "source": [ - "### 5.1 - Set Input/output Folder" + "### 7.1 - Execute" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "1f747c0d", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1f747c0d", - "outputId": "e42500b7-5d1e-41fd-b53b-34d3393f36f4" - }, + "execution_count": 17, + "id": "f6430f24", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" + "🏃🏼 STAGE-4: Processing input='output/03_exact_dedupe_out' --> output='output/04_fuzzy_dedupe_out'\n", + "\n" ] - } - ], - "source": [ - "\n", - "# Input for this stage is the output of exact dedeup component\n", - "# output of this component makes it possible for fdedup component to run on data.\n", - "\n", - "STAGE = 3\n", - "\n", - "input_folder = output_chunk_dir\n", - "output_folder = output_docid_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "18aa0fe1", - "metadata": { - "id": "18aa0fe1" - }, - "source": [ - "### 5.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "f6e9e145", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "f6e9e145", - "outputId": "2add5f0c-3ab6-4336-8a7b-ac8b1b76ab73" - }, - "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "13:31:29 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", - "13:31:29 INFO 
- pipeline id pipeline_id\n", - "13:31:29 INFO - code location None\n", - "13:31:29 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:29 INFO - actor creation delay 0\n", - "13:31:29 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:29 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", - "13:31:29 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:29 INFO - Running locally\n", - "2024-10-18 13:31:31,792\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - orchestrator started at 2024-10-18 13:31:32\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of files is 2, source profile {'max_file_size': 0.008975982666015625, 'min_file_size': 0.008897781372070312, 'total_file_size': 0.017873764038085938}\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.033103181049228, 'object_store': 7.516551589593291}\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:32 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - Completed processing 2 files in 0.012 min\n", - "\u001b[36m(orchestrate pid=12291)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", - "13:31:43 INFO - Completed execution in 0.228 min, execution result 0\n" + "11:31:43 INFO - Starting SignatureCalculation step\n", + "11:31:43 INFO - Got parameters for SignatureCalculation\n", + "11:31:43 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.9, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "11:31:43 INFO - data factory scdata_ is using local configuration without input/output path\n", + "11:31:43 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "11:31:43 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:31:43 INFO - pipeline id pipeline_id\n", + "11:31:43 INFO - code location None\n", + "11:31:43 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "11:31:43 INFO - actor creation delay 0\n", + "11:31:43 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", + "11:31:43 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "11:31:43 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:31:43 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:31:43 INFO - Running locally\n", + "2025-01-29 11:31:44,268\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:45 INFO - orchestrator started at 2025-01-29 11:31:45\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:45 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05068492889404297}\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:45 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.094626618549228, 'object_store': 4.547313308343291}\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:45 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed 3 files (50.0%) in 0.003 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - done flushing in 0.026 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=3073102)\u001b[0m 11:31:46 WARNING - table is empty, skipping processing\n", + "\u001b[36m(RayTransformFileProcessor pid=3073102)\u001b[0m 11:31:46 INFO - Starting flush()\n", + "\u001b[36m(RayTransformFileProcessor pid=3073100)\u001b[0m 11:31:46 INFO - Wrote 14 tables with a total size of 6,720 bytes\n", + "11:31:56 INFO - Completed execution in 0.225 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=3073101)\u001b[0m 11:31:46 INFO - Starting flush()\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. 
Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", + "\u001b[36m(RayTransformFileProcessor pid=3073101)\u001b[0m 11:31:46 INFO - Wrote 14 tables with a total size of 26,880 bytes\n", + "11:31:58 INFO - SignatureCalculation completed successfully\n", + "11:31:58 INFO - Starting ClusterAnalysis step\n", + "11:31:58 INFO - Got parameters for ClusterAnalysis\n", + "11:31:58 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.9, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "11:31:58 INFO - pipeline id pipeline_id\n", + "11:31:58 INFO - code location None\n", + "11:31:58 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "11:31:58 INFO - actor creation delay 0\n", + "11:31:58 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", + "11:31:58 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "11:31:58 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:31:58 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:31:58 INFO - Running locally\n", + "2025-01-29 11:31:59,071\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:00 INFO - orchestrator started at 2025-01-29 11:32:00\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:00 INFO - Number of folders is 14\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:00 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.028518676757812, 'object_store': 4.514259338378906}\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:00 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 2 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 3 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 4 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 5 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 6 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 7 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 8 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 9 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 10 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 11 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 11 files (78.571%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed processing 14 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - done flushing in 0.001 sec\n", + "11:32:11 INFO - Completed execution in 0.222 min, execution result 0\n", + "11:32:12 INFO - ClusterAnalysis completed successfully\n", + "11:32:12 INFO - Starting GetDuplicateList step\n", + "11:32:12 INFO - Got parameters for GetDuplicateList\n", + "11:32:12 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "11:32:12 INFO - pipeline id pipeline_id\n", + "11:32:12 INFO - code location None\n", + "11:32:12 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "11:32:12 INFO - actor creation delay 0\n", + "11:32:12 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", + "11:32:12 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "11:32:12 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:32:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:32:12 INFO - Running locally\n", + "2025-01-29 11:32:13,701\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:14 INFO - orchestrator started at 2025-01-29 11:32:14\n", + "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:14 INFO - Number of folders is 1\n", + "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:14 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.045405579730868, 'object_store': 4.522702788934112}\n", + "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:14 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:16 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:16 INFO - Completed processing 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:16 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=3076312)\u001b[0m 11:32:16 INFO - Get Duplicate List for folder docs_to_remove\n", + "\u001b[36m(RayTransformFileProcessor pid=3076312)\u001b[0m 11:32:16 INFO - 0 documents marked as duplicates\n", + "11:32:26 INFO - Completed execution in 0.222 min, execution result 0\n", + "11:32:27 INFO - GetDuplicateList completed successfully\n", + "11:32:27 INFO - Starting DataCleaning step\n", + "11:32:27 INFO - Got parameters for DataCleaning\n", + "11:32:27 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "11:32:27 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "11:32:27 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "11:32:27 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + 
"11:32:27 INFO - pipeline id pipeline_id\n", + "11:32:27 INFO - code location None\n", + "11:32:27 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "11:32:27 INFO - actor creation delay 0\n", + "11:32:27 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", + "11:32:27 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "11:32:27 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:32:27 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:32:27 INFO - Running locally\n", + "2025-01-29 11:32:28,365\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:29 INFO - orchestrator started at 2025-01-29 11:32:29\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:29 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05068492889404297}\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:29 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.023682404309511, 'object_store': 4.511841201223433}\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:29 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed 3 
files (50.0%) in 0.003 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=3077841)\u001b[0m 11:32:30 WARNING - table is empty, skipping processing\n", + "11:32:40 INFO - Completed execution in 0.227 min, execution result 0\n", + "11:32:42 INFO - DataCleaning completed successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:3 completed successfully\n", - "CPU times: user 123 ms, sys: 145 ms, total: 267 ms\n", - "Wall time: 15.2 s\n" + "CPU times: user 517 ms, sys: 575 ms, total: 1.09 s\n", + "Wall time: 59.1 s\n" ] } ], "source": [ "%%time\n", "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # doc id configuration\n", - " \"doc_id_doc_column\": \"contents\",\n", - " \"doc_id_hash_column\": \"chunk_hash\",\n", - " \"doc_id_int_column\": \"chunk_id\",\n", - "}\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", + "from dpk_fdedup.ray.transform import Fdedup\n", "\n", - "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", + "STAGE = 4\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_exact_dedupe_dir}' --> output='{output_fuzzy_dedupe_dir}'\\n\", flush=True)\n", "\n", - "return_code = launcher.launch()\n", + "result = Fdedup(input_folder=output_exact_dedupe_dir,\n", + " output_folder=output_fuzzy_dedupe_dir,\n", + " contents_column= \"contents\",\n", + " # document_id_column= \"doc_id\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " jaccard_similarity_threshold = 0.9, # between 0 - 1. 
higher means more strict checking\n", + " operation_mode=\"filter_duplicates\",\n", + " # operation_mode=\"annotate\",\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " ).transform()\n", "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Ray job failed\")" + "# if result == 0:\n", + "# print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "# else:\n", + "# raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" ] }, { "cell_type": "markdown", - "id": "4954402f", - "metadata": { - "id": "4954402f" - }, + "id": "037d3974", + "metadata": {}, "source": [ - "### 5.3 - Inspect Generated output\n", + "### 7.2 - Inspect Output\n", "\n", - "You will notice we have two extra columns\n", + "Fuzzy dedupe writes the documents that survive filtering to the **output/04_fuzzy_dedupe_out/cleaned** folder\n", "\n", - "- **hash_column**\n", - "- **int_id_column**\n", - "\n", - "But still the same number or rows as before" + "If fuzzy dedupe catches the near-duplicate pair, only one of **earth.pdf** / **earth2.pdf** will make it through. Compare the before/after counts below to confirm; if both documents are still present, no near duplicates were detected at the current `jaccard_similarity_threshold` and you may want to lower it." 
] }, { "cell_type": "code", - "execution_count": 20, - "id": "1911179a", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 860 - }, - "id": "1911179a", - "outputId": "45e83e2a-1f70-46b9-e311-c50f025419be" - }, + "execution_count": 18, + "id": "d59496f0", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (8, 16)\n", - "Output data dimensions (rows x columns)= (8, 18)\n" + "Input files before exact dedupe : 5\n", + "Output files after exact dedupe : 5\n", + "Near duplicate files removed : 0\n", + "Displaying contents of : output/04_fuzzy_dedupe_out\n" ] }, { @@ -1859,479 +1694,321 @@ " \n", " \n", " filename\n", + " contents\n", " num_pages\n", " num_tables\n", " num_doc_elements\n", + " document_id\n", + " document_hash\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", " source_filename\n", - " source_document_id\n", - " contents\n", - " doc_jsonpath\n", - " page_number\n", - " bbox\n", - " document_id\n", - " chunk_hash\n", - " chunk_id\n", + " doc_hash\n", + " int_id_column\n", + " removed\n", " \n", " \n", " \n", " \n", " 0\n", - " mars.pdf\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", " 1\n", " 0\n", - " 11\n", + " 2\n", + " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " 6571294142213095721\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Solar System\\nOur solar system is a vast and f...\n", - " $.main-text[2]\n", - " 1\n", - " [132.84518433, 588.96014404, 479.40917969, 623...\n", - " 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...\n", - " 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...\n", - " 4\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " 2025-01-29T11:31:00.543154\n", + " 1.836573\n", + " lorem-ipsum.pdf\n", + 
" bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 3\n", + " []\n", " \n", " \n", " 1\n", - " mars.pdf\n", + " spam.pdf\n", + " Free xxx\n", " 1\n", " 0\n", - " 11\n", + " 2\n", + " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " 10026122586747302274\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Solar System\\nFor more details about the Solar...\n", - " $.main-text[3]\n", - " 1\n", - " [133.18510437, 570.83258057, 374.99838257, 581...\n", - " dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...\n", - " dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...\n", + " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", + " 8\n", + " 2025-01-29T11:31:02.125197\n", + " 1.579146\n", + " spam.pdf\n", + " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", + " []\n", " \n", " \n", " 2\n", - " mars.pdf\n", + " earth2.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", + " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 10729312978404042321\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Mars\\nMars, the fourth planet from the Sun, is...\n", - " $.main-text[5]\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", + " 610\n", + " 2025-01-29T11:31:00.528443\n", + " 1.846497\n", + " earth2.pdf\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 1\n", - " [132.87440491, 500.84011841, 477.48345947, 534...\n", - " a31663e06fac41470ecc459f5a58658a3f9997d7801053...\n", - " a31663e06fac41470ecc459f5a58658a3f9997d7801053...\n", - " 6\n", + " []\n", " \n", " \n", " 3\n", " mars.pdf\n", + " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", + " 
22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 7758129997476962679\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 717\n", + " 2025-01-29T11:31:02.115064\n", + " 1.583783\n", " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Basic facts about Mars:\\n· Distance from the S...\n", - " $.main-text[6]\n", - " 1\n", - " [133.2026062, 482.90710449, 237.04431152, 493....\n", - " 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...\n", - " 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...\n", - " 7\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 4\n", + " []\n", " \n", " \n", " 4\n", " earth.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", + " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 14711865278795535908\n", " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Solar System\\nOur solar system is a vast and f...\n", - " $.main-text[2]\n", - " 1\n", - " [132.87112427, 588.96014404, 479.40917969, 623...\n", - " 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...\n", - " 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...\n", - " 0\n", - " \n", - " \n", - " 5\n", - " earth.pdf\n", - " 1\n", - " 0\n", - " 11\n", - " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Solar System\\nFor more details about our Solar...\n", - " $.main-text[3]\n", - " 1\n", - " [133.20942688, 570.81555176, 375.57919312, 581...\n", - " d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...\n", - " d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...\n", - " 1\n", - " \n", - " \n", - " 6\n", - " 
earth.pdf\n", - " 1\n", - " 0\n", - " 11\n", - " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " 2025-01-29T11:30:58.677819\n", + " 1.933069\n", " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Earth\\nEarth is the third planet from the Sun....\n", - " $.main-text[5]\n", - " 1\n", - " [132.91053772, 512.46295166, 477.84887695, 534...\n", - " 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...\n", - " 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 2\n", - " \n", - " \n", - " 7\n", - " earth.pdf\n", - " 1\n", - " 0\n", - " 11\n", - " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Earth\\nBasic facts about Earth:\\n· Distance fr...\n", - " $.main-text[6]\n", - " 1\n", - " [133.30151367, 494.86206055, 240.17156982, 505...\n", - " 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...\n", - " 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...\n", - " 3\n", + " []\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 mars.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", - "7 earth.pdf 1 0 11 pdf \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "7 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "7 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "7 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + " document_id document_hash ext \\\n", + "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", + "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", + "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", + "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", + "4 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf 
\n", "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "2 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "3 Basic facts about Mars:\\n· Distance from the S... $.main-text[6] \n", - "4 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "5 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "6 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "7 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... \n", - "1 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "2 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "3 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "4 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "5 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "6 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "7 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + " hash size \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", - " document_id \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 
\n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2025-01-29T11:31:00.543154 1.836573 lorem-ipsum.pdf \n", + "1 2025-01-29T11:31:02.125197 1.579146 spam.pdf \n", + "2 2025-01-29T11:31:00.528443 1.846497 earth2.pdf \n", + "3 2025-01-29T11:31:02.115064 1.583783 mars.pdf \n", + "4 2025-01-29T11:30:58.677819 1.933069 earth.pdf \n", "\n", - " chunk_hash chunk_id \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 \n", - "1 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 5 \n", - "2 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "3 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "4 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "5 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "6 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "7 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 " + " doc_hash int_id_column removed \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
2 [] " ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", + "input_df = read_parquet_files_as_df(output_exact_dedupe_dir)\n", + "output_df = read_parquet_files_as_df(os.path.join(output_fuzzy_dedupe_dir, \"cleaned\"))\n", "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "# print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "# print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input files before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output files after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Near duplicate files removed : \", (input_df.shape[0] - output_df.shape[0]))\n", "\n", + "print (\"Displaying contents of : \", output_fuzzy_dedupe_dir)\n", "output_df.head(10)" ] }, { "cell_type": "markdown", - "id": "852829dc", - "metadata": { - "id": "852829dc" - }, + "id": "c3e4f860", + "metadata": {}, "source": [ - "## Step-6: Exact Dedup\n", - "\n" + "## Step-8: Document Quality\n", + "\n", + "This handy plugin will score documents across many metrics.\n", + "\n", + "Here we will look for 'bad words' metric.\n", + "\n", + "[Document quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)\n", + "\n", + "By default it uses [bad words collection](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality/dpk_doc_quality/ldnoobw). 
You can supply a custom file by passing an argument `bad_word_filepath=/path/to/badwords_file`" ] }, { "cell_type": "markdown", - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", - "metadata": { - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe" - }, + "id": "144a0fff", + "metadata": {}, "source": [ - "### 6.1 - Set Input/output Folder" + "### 8.1 - Execute" ] }, { "cell_type": "code", - "execution_count": 21, - "id": "4c7a1b94", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4c7a1b94", - "outputId": "40a119b4-44fc-483d-9ad0-da178a2a8eb1" - }, + "execution_count": 19, + "id": "63140942", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" + "🏃🏼 STAGE-5: Processing input='output/04_fuzzy_dedupe_out/cleaned' --> output='output/05_doc_quality_out'\n", + "\n" ] - } - ], - "source": [ - "STAGE = 4\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_exact_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", - "metadata": { - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e" - }, - "source": [ - "### 6.2 - Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", - "outputId": "bd0f3f94-8c48-4c6b-b911-858e389243f4" - }, - "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "13:31:45 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 
'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", - "13:31:45 INFO - pipeline id pipeline_id\n", - "13:31:45 INFO - code location None\n", - "13:31:45 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:45 INFO - actor creation delay 0\n", - "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:45 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", - "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:45 INFO - Running locally\n", - "2024-10-18 13:31:47,001\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - orchestrator started at 2024-10-18 13:31:48\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.010423279367387, 'object_store': 7.505211639218032}\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - Completed processing 2 files in 0.013 min\n", - "\u001b[36m(orchestrate pid=13775)\u001b[0m 13:31:48 INFO - done flushing in 0.001 sec\n", - "13:31:58 INFO - Completed execution in 0.228 min, execution result 0\n" + "11:32:42 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "11:32:42 INFO - data factory docq_ is using local configuration without input/output path\n", + "11:32:42 INFO - data factory docq_ max_files -1, n_sample -1\n", + "11:32:42 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:32:42 INFO - pipeline id pipeline_id\n", + "11:32:42 INFO - code location None\n", + "11:32:42 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "11:32:42 INFO - actor creation delay 0\n", + "11:32:42 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", + "11:32:42 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "11:32:42 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:32:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:32:42 INFO - Running locally\n", + "2025-01-29 11:32:43,435\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:44 INFO - orchestrator started at 2025-01-29 11:32:44\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:44 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0069904327392578125, 'total_file_size': 0.047461509704589844}\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:44 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.050894166342914, 'object_store': 4.525447081774473}\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:44 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=3079444)\u001b[0m 11:32:45 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed 3 files (60.0%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed processing 5 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - done flushing in 0.001 sec\n", + "11:32:56 INFO - Completed execution in 0.226 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=3079443)\u001b[0m 11:32:45 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:4 completed successfully\n", - "CPU times: user 136 ms, sys: 154 ms, total: 289 ms\n", - "Wall time: 15.2 s\n" + "✅ Stage:5 completed successfully\n", + "CPU times: user 121 ms, sys: 170 ms, total: 290 ms\n", + "Wall time: 15 s\n" ] } ], "source": [ "%%time\n", "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # ededup parameters\n", - " \"ededup_hash_cpu\": 0.5,\n", - " \"ededup_num_hashes\": 2,\n", - " \"ededup_doc_column\": \"contents\",\n", - " \"ededup_doc_id_column\": \"chunk_hash\",\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", + "from dpk_doc_quality.ray.transform import DocQuality\n", + "\n", + "STAGE = 5\n", + "output_fuzzy_dedupe_cleaned_dir = os.path.join(output_fuzzy_dedupe_dir, \"cleaned\")\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{output_fuzzy_dedupe_cleaned_dir}' --> output='{output_doc_quality_dir}'\\n\", flush=True)\n", + "\n", + "result = DocQuality(input_folder=output_fuzzy_dedupe_cleaned_dir,\n", + " output_folder= output_doc_quality_dir,\n", + " docq_text_lang = \"en\",\n", + " docq_doc_content_column =\"contents\",\n", + " \n", + " # runtime config\n", + " run_locally= True,\n", + " num_cpus= CONFIG_RAY_NUM_CPUS,\n", + " memory= CONFIG_RAY_MEMORY,\n", + " runtime_num_workers = CONFIG_RAY_RUNTIME_WORKERS,\n", + " ).transform()\n", + "\n", + "if result == 0:\n", " print (f\"✅ Stage:{STAGE} completed successfully\")\n", "else:\n", - " raise Exception (\"❌ Ray job failed\")" + " raise Exception (f\"❌ Stage:{STAGE} failed (result={result})\")" ] }, { "cell_type": "markdown", - "id": "eaf1c3c3", - "metadata": { - "id": "eaf1c3c3" - }, + "id": "1006b475", + "metadata": {}, "source": [ - "### 6.3 - Inspect Generated output" + "### 8.2 - Inspect the Output\n", + "\n", + "We will see several new columns 
starting with the name **docq_**.\n", + "\n", + "Look at the column **docq_contain_bad_word**; this will flag documents with 'bad words'.\n", + "\n", + "Also inspect the column **docq_lorem_ipsum_ratio**; this will flag documents with 'lorem ipsum' text\n", + "\n", + "For more information see : [Doc Quality documentation](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_quality)" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "d824ebf6", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 815 - }, - "id": "d824ebf6", - "outputId": "9173efb6-1b95-4a7e-b531-1a611841a4d0" - }, + "execution_count": 20, + "id": "24181587", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (7, 19)\n", - "Input chunks before exact dedupe : 8\n", - "Output chunks after exact dedupe : 7\n", - "Duplicate chunks removed : 1\n" + "Displaying contents of : output/05_doc_quality_out\n" ] }, { @@ -2356,299 +2033,244 @@ " \n", " \n", " filename\n", + " contents\n", " num_pages\n", " num_tables\n", " num_doc_elements\n", + " document_id\n", + " document_hash\n", " ext\n", " hash\n", " size\n", - " date_acquired\n", - " pdf_convert_time\n", - " source_filename\n", - " source_document_id\n", - " contents\n", - " doc_jsonpath\n", - " page_number\n", - " bbox\n", - " document_id\n", - " chunk_hash\n", - " chunk_id\n", - " removed\n", + " ...\n", + " docq_mean_word_len\n", + " docq_symbol_to_word_ratio\n", + " docq_sentence_count\n", + " docq_lorem_ipsum_ratio\n", + " docq_curly_bracket_ratio\n", + " docq_contain_bad_word\n", + " docq_bullet_point_ratio\n", + " docq_ellipsis_line_ratio\n", + " docq_alphabet_word_ratio\n", + " docq_contain_common_en_words\n", " \n", " \n", " \n", " \n", " 0\n", - " mars.pdf\n", + " lorem-ipsum.pdf\n", + " Lorem ipsum Lorem ipsum Lorem ipsum\n", " 1\n", " 0\n", - " 11\n", + " 
2\n", + " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " 6571294142213095721\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Solar System\\nFor more details about the Solar...\n", - " $.main-text[3]\n", - " 1\n", - " [133.18510437, 570.83258057, 374.99838257, 581...\n", - " dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...\n", - " dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07...\n", - " 5\n", - " [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567...\n", + " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", + " 35\n", + " ...\n", + " 5.000000\n", + " 0.000000\n", + " 1\n", + " 0.085714\n", + " 0.0\n", + " False\n", + " 0.000000\n", + " 0.0\n", + " 1.000000\n", + " False\n", " \n", " \n", " 1\n", - " mars.pdf\n", + " spam.pdf\n", + " Free xxx\n", " 1\n", " 0\n", - " 11\n", + " 2\n", + " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " 10026122586747302274\n", " pdf\n", - " 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Mars\\nMars, the fourth planet from the Sun, is...\n", - " $.main-text[5]\n", - " 1\n", - " [132.87440491, 500.84011841, 477.48345947, 534...\n", - " a31663e06fac41470ecc459f5a58658a3f9997d7801053...\n", - " a31663e06fac41470ecc459f5a58658a3f9997d7801053...\n", - " 6\n", - " []\n", + " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", + " 8\n", + " ...\n", + " 3.500000\n", + " 0.000000\n", + " 1\n", + " 0.000000\n", + " 0.0\n", + " True\n", + " 0.000000\n", + " 0.0\n", + " 1.000000\n", + " False\n", " \n", " \n", " 2\n", - " mars.pdf\n", + " earth2.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", + " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 10729312978404042321\n", " pdf\n", - " 
8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...\n", - " 2800\n", - " 2024-10-18T13:30:59.490007\n", - " 2.011138\n", - " mars.pdf\n", - " 62e5639f-f922-4ccc-a041-3cb02f1cfd83\n", - " Basic facts about Mars:\\n· Distance from the S...\n", - " $.main-text[6]\n", - " 1\n", - " [133.2026062, 482.90710449, 237.04431152, 493....\n", - " 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...\n", - " 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...\n", - " 7\n", - " []\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", + " 610\n", + " ...\n", + " 4.541284\n", + " 0.027523\n", + " 9\n", + " 0.000000\n", + " 0.0\n", + " False\n", + " 0.176471\n", + " 0.0\n", + " 0.880734\n", + " True\n", " \n", " \n", " 3\n", - " earth.pdf\n", + " mars.pdf\n", + " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", " 1\n", " 0\n", " 11\n", + " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 7758129997476962679\n", " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Solar System\\nOur solar system is a vast and f...\n", - " $.main-text[2]\n", - " 1\n", - " [132.87112427, 588.96014404, 479.40917969, 623...\n", - " 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...\n", - " 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...\n", - " 0\n", - " []\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 717\n", + " ...\n", + " 4.688000\n", + " 0.032000\n", + " 8\n", + " 0.000000\n", + " 0.0\n", + " False\n", + " 0.176471\n", + " 0.0\n", + " 0.880000\n", + " True\n", " \n", " \n", " 4\n", " earth.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", + " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 14711865278795535908\n", " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " 
f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Solar System\\nFor more details about our Solar...\n", - " $.main-text[3]\n", - " 1\n", - " [133.20942688, 570.81555176, 375.57919312, 581...\n", - " d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...\n", - " d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...\n", - " 1\n", - " []\n", - " \n", - " \n", - " 5\n", - " earth.pdf\n", - " 1\n", - " 0\n", - " 11\n", - " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Earth\\nEarth is the third planet from the Sun....\n", - " $.main-text[5]\n", - " 1\n", - " [132.91053772, 512.46295166, 477.84887695, 534...\n", - " 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...\n", - " 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...\n", - " 2\n", - " []\n", - " \n", - " \n", - " 6\n", - " earth.pdf\n", - " 1\n", - " 0\n", - " 11\n", - " pdf\n", - " 18713f970989055625bef22209b6f4b6830b9ca22046bf...\n", - " 2686\n", - " 2024-10-18T13:30:59.494027\n", - " 2.015123\n", - " earth.pdf\n", - " f3c0ac2e-1de2-472b-8216-2043f3b3e9d1\n", - " Earth\\nBasic facts about Earth:\\n· Distance fr...\n", - " $.main-text[6]\n", - " 1\n", - " [133.30151367, 494.86206055, 240.17156982, 505...\n", - " 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...\n", - " 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...\n", - " 3\n", - " []\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " ...\n", + " 4.541284\n", + " 0.027523\n", + " 9\n", + " 0.000000\n", + " 0.0\n", + " False\n", + " 0.176471\n", + " 0.0\n", + " 0.880734\n", + " True\n", " \n", " \n", "\n", + "

5 rows × 27 columns

\n", "" ], "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "6 earth.pdf 1 0 11 pdf \n", + " filename contents \\\n", + "0 lorem-ipsum.pdf Lorem ipsum Lorem ipsum Lorem ipsum \n", + "1 spam.pdf Free xxx \n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "6 18713f970989055625bef22209b6f4b6830b9ca22046bf... 
2686 \n", + " num_pages num_tables num_doc_elements \\\n", + "0 1 0 2 \n", + "1 1 0 2 \n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "6 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", + " document_id document_hash ext \\\n", + "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", + "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", + "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", + "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", + "4 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "6 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", + " hash size ... \\\n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 8 ... \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 ... \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nFor more details about the Solar... $.main-text[3] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\n· Distance from the S... 
$.main-text[6] \n", - "3 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "4 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "5 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "6 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "0 5.000000 0.000000 1 \n", + "1 3.500000 0.000000 1 \n", + "2 4.541284 0.027523 9 \n", + "3 4.688000 0.032000 8 \n", + "4 4.541284 0.027523 9 \n", "\n", - " page_number bbox \\\n", - "0 1 [133.18510437, 570.83258057, 374.99838257, 581... \n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [132.87112427, 588.96014404, 479.40917969, 623... \n", - "4 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "5 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "6 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "0 0.085714 0.0 False \n", + "1 0.000000 0.0 True \n", + "2 0.000000 0.0 False \n", + "3 0.000000 0.0 False \n", + "4 0.000000 0.0 False \n", "\n", - " document_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... \n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "0 0.000000 0.0 \n", + "1 0.000000 0.0 \n", + "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", + "4 0.176471 0.0 \n", "\n", - " chunk_hash chunk_id \\\n", - "0 dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... 
5 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 \n", - "3 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 0 \n", - "4 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 \n", - "5 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 \n", - "6 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 \n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "0 1.000000 False \n", + "1 1.000000 False \n", + "2 0.880734 True \n", + "3 0.880000 True \n", + "4 0.880734 True \n", "\n", - " removed \n", - "0 [44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... \n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] " + "[5 rows x 27 columns]" ] }, - "execution_count": 23, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from my_utils import read_parquet_files_as_df\n", + "output_df = read_parquet_files_as_df(output_doc_quality_dir)\n", + "print (\"Displaying contents of : \", output_doc_quality_dir)\n", + "output_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c343b656", + "metadata": {}, + "source": [ + "### 8.3 - Filtering 'quality' documents\n", "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", + "So from the output above we see **spam.pdf** is flagged for containing bad words (**docq_contain_bad_word=True**).\n", "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "Also **lorem.pdf** is flagged for place holder content **lorem ipsum** (**docq_lorem_ipsum_ratio > 0**)\n", "\n", - "output_df.head(10)" + "We are going to filter them both out" ] }, { 
"cell_type": "code", - "execution_count": 24, - "id": "82cc9bb0", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 269 - }, - "id": "82cc9bb0", - "outputId": "e043fa01-ceca-49ae-b764-8154219c7b6c" - }, + "execution_count": 21, + "id": "4b3dee53", + "metadata": {}, "outputs": [ { "data": { @@ -2673,1317 +2295,238 @@ " \n", " filename\n", " contents\n", + " num_pages\n", + " num_tables\n", + " num_doc_elements\n", + " document_id\n", + " document_hash\n", + " ext\n", + " hash\n", + " size\n", + " ...\n", + " docq_mean_word_len\n", + " docq_symbol_to_word_ratio\n", + " docq_sentence_count\n", + " docq_lorem_ipsum_ratio\n", + " docq_curly_bracket_ratio\n", + " docq_contain_bad_word\n", + " docq_bullet_point_ratio\n", + " docq_ellipsis_line_ratio\n", + " docq_alphabet_word_ratio\n", + " docq_contain_common_en_words\n", " \n", " \n", " \n", " \n", - " 0\n", - " mars.pdf\n", - " Solar System\\nFor more details about the Solar...\n", - " \n", - " \n", - " 1\n", - " mars.pdf\n", - " Mars\\nMars, the fourth planet from the Sun, is...\n", - " \n", - " \n", " 2\n", - " mars.pdf\n", - " Basic facts about Mars:\\n· Distance from the S...\n", + " earth2.pdf\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", + " 1\n", + " 0\n", + " 11\n", + " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 10729312978404042321\n", + " pdf\n", + " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", + " 610\n", + " ...\n", + " 4.541284\n", + " 0.027523\n", + " 9\n", + " 0.0\n", + " 0.0\n", + " False\n", + " 0.176471\n", + " 0.0\n", + " 0.880734\n", + " True\n", " \n", " \n", " 3\n", - " earth.pdf\n", - " Solar System\\nOur solar system is a vast and f...\n", + " mars.pdf\n", + " ## Mars\\n\\n## Solar System\\n\\nOur solar system...\n", + " 1\n", + " 0\n", + " 11\n", + " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 7758129997476962679\n", + " pdf\n", + " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", + " 717\n", + " ...\n", + " 4.688000\n", + " 
0.032000\n", + " 8\n", + " 0.0\n", + " 0.0\n", + " False\n", + " 0.176471\n", + " 0.0\n", + " 0.880000\n", + " True\n", " \n", " \n", " 4\n", " earth.pdf\n", - " Solar System\\nFor more details about our Solar...\n", - " \n", - " \n", - " 5\n", - " earth.pdf\n", - " Earth\\nEarth is the third planet from the Sun....\n", - " \n", - " \n", - " 6\n", - " earth.pdf\n", - " Earth\\nBasic facts about Earth:\\n· Distance fr...\n", + " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", + " 1\n", + " 0\n", + " 11\n", + " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 14711865278795535908\n", + " pdf\n", + " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", + " 610\n", + " ...\n", + " 4.541284\n", + " 0.027523\n", + " 9\n", + " 0.0\n", + " 0.0\n", + " False\n", + " 0.176471\n", + " 0.0\n", + " 0.880734\n", + " True\n", " \n", " \n", "\n", + "

3 rows × 27 columns

\n", "" ], "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nFor more details about the Solar...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\n· Distance from the S...\n", - "3 earth.pdf Solar System\\nOur solar system is a vast and f...\n", - "4 earth.pdf Solar System\\nFor more details about our Solar...\n", - "5 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "6 earth.pdf Earth\\nBasic facts about Earth:\\n· Distance fr..." - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "cc61dffa", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cc61dffa", - "outputId": "aff7a0d9-a791-42a5-d5b7-ad643f59f261" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about the Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "· Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 3------\n", - "Earth\n", - "Basic facts about Earth:\n", - "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "· Rotation Period: 24 hours (one day)\n", - "· Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "383f40ba", - "metadata": { - "id": "383f40ba" - }, - "source": [ - "### 6.4 - Understanding the output\n", - "\n", - "Remember we had 8 chunks initially. Now we have 7! One duplicate chunk is removed.\n", - "\n", - "If you look at the PDF, the following common paragraph in `earth.pdf` and `mars.pdf` is removed from one of the documents! Pretty neat, eh!\n", - "\n", - "```text\n", - "## Solar System\n", - "\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. 
At its center lies the star we call the Sun.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "85309751-8556-41c6-ac32-84acc941bc8d", - "metadata": { - "id": "85309751-8556-41c6-ac32-84acc941bc8d" - }, - "source": [ - "## Step-7: Fuzzy Dedup\n", - "\n", - "Post exact deduplication, fuzzy deduplication is applied with the goal of removing code files that may have **slight variations** and thereby unbiasing\n", - "the data further.\n", - "\n", - "Small variations are quite commonly seen in code data in the form of variations in the values of variables, addittion of logging statements etc." - ] - }, - { - "cell_type": "markdown", - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", - "metadata": { - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6" - }, - "source": [ - "### 7.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", - "outputId": "d53a92d2-0f1c-465f-f11c-b9bc2931f651" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-5: Processing input='output/03_docid_out' --> output='output/05_fuzzy_dedupe_out'\n" - ] - } - ], - "source": [ - "## Input to this component is the output of doc_id generator component.\n", - "\n", - "STAGE = 5\n", - "\n", - "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_fuzzy_dedupe_dir\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "markdown", - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", - "metadata": { - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3" - }, - "source": [ - "### 7.2 - Execute" - ] - }, - { - "cell_type": "code", - 
"execution_count": 27, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", - "outputId": "1e63d364-3944-465a-ff7c-6e1dc750b2de" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:32:00 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 0.8}}\n", - "13:32:00 INFO - pipeline id pipeline_id\n", - "13:32:00 INFO - code location None\n", - "13:32:00 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:00 INFO - actor creation delay 0\n", - "13:32:00 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:00 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/05_fuzzy_dedupe_out\n", - "13:32:00 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:00 INFO - Running locally\n", - "2024-10-18 13:32:02,246\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - orchestrator started at 2024-10-18 13:32:03\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of files is 2, source profile {'max_file_size': 0.010180473327636719, 'min_file_size': 0.010101318359375, 'total_file_size': 0.02028179168701172}\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 15.000544739887118, 'object_store': 7.500272369012237}\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - starting run from the beginning\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - continuing from the very beginning\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Fuzzy: num buckets 8, bucket length 8\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 bucket actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 minhash actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - Table preprocessing uses 1 readers\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:03 INFO - created 1 table processor actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files in 0.064 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:07 INFO - Completed 1 files (50.0%) in 0.064 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - Completed processing 2 files in 0.197 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:15 INFO - creating minhash snapshots\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - minhash snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:16 INFO - creating bucket snapshots\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - bucket snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 document actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created 1 bucket processor actors\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - created bucket processor invoker\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - added invoker to bucket collectors\n", - "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - processing buckets 0 long, 53 short\n", - "\u001b[36m(BucketsHash pid=16209)\u001b[0m 13:32:17 INFO - Done submitting long buckets\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - Done processing buckets in 0.01 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:17 INFO - creating document snapshots\n", - "\u001b[36m(BucketsHashProcessorInvoker pid=16602)\u001b[0m 13:32:17 INFO - Waiting bucket processing completion. Submitted requests 1\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - document snapshots created\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:18 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - Completed processing 2 files in 0.113 min\n", - "\u001b[36m(orchestrate pid=15368)\u001b[0m 13:32:25 INFO - done flushing in 0.005 sec\n", - "13:32:35 INFO - Completed execution in 0.588 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:5 completed successfully\n", - "CPU times: user 270 ms, sys: 200 ms, total: 470 ms\n", - "Wall time: 36.6 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "\n", - "# create parameters\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # Orchestration parameters\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # columns used\n", - " \"fdedup_doc_column\": \"contents\",\n", - " \"fdedup_id_column\": \"chunk_id\",\n", - " \"fdedup_cluster_column\": \"chunk_hash\",\n", - " # infrastructure\n", - " \"fdedup_bucket_cpu\": 0.3,\n", - " \"fdedup_doc_cpu\": 0.3,\n", - " \"fdedup_mhash_cpu\": 0.3,\n", - " \"fdedup_num_doc_actors\": 1,\n", - " \"fdedup_num_bucket_actors\": 1,\n", - " \"fdedup_num_minhash_actors\": 1,\n", - " \"fdedup_num_preprocessors\": 1,\n", - " # fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.7, # (default 0.8)\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", - "}\n", - "\n", - "# Pass commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# launch\n", - "\n", - "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", - "\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "a6f8cd11", - "metadata": { - "id": "a6f8cd11" - }, - "source": [ - "### 7.3 - Inspect Generated output" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "e899ad60", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 677 - }, - "id": "e899ad60", - "outputId": "fcfda84c-ebbf-490f-f478-ceef7ca9e83b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (8, 18)\n", - "Output data dimensions (rows x columns)= (6, 18)\n", - "Duplicate chunks removed by fuzzy-dedupe: 2\n" - ] - }, 
- { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hash
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\n· Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\n· Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\n· Distance from the S... $.main-text[6] \n", - "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "5 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... 
\n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id chunk_id chunk_hash \n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", - "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", - "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", - "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 " - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "ab7ea52b", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 238 - }, - "id": "ab7ea52b", - "outputId": "e38754ee-777f-4ed7-ebc0-9299ee122662" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamecontents
0mars.pdfSolar System\\nOur solar system is a vast and f...
1mars.pdfMars\\nMars, the fourth planet from the Sun, is...
2mars.pdfBasic facts about Mars:\\n· Distance from the S...
3earth.pdfSolar System\\nFor more details about our Solar...
4earth.pdfEarth\\nEarth is the third planet from the Sun....
5earth.pdfEarth\\nBasic facts about Earth:\\n· Distance fr...
\n", - "
" - ], - "text/plain": [ - " filename contents\n", - "0 mars.pdf Solar System\\nOur solar system is a vast and f...\n", - "1 mars.pdf Mars\\nMars, the fourth planet from the Sun, is...\n", - "2 mars.pdf Basic facts about Mars:\\n· Distance from the S...\n", - "3 earth.pdf Solar System\\nFor more details about our Solar...\n", - "4 earth.pdf Earth\\nEarth is the third planet from the Sun....\n", - "5 earth.pdf Earth\\nBasic facts about Earth:\\n· Distance fr..." + " document_hash ext \\\n", + "2 10729312978404042321 pdf \n", + "3 7758129997476962679 pdf \n", + "4 14711865278795535908 pdf \n", + "\n", + " hash size ... \\\n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 ... \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 ... \n", + "\n", + " docq_mean_word_len docq_symbol_to_word_ratio docq_sentence_count \\\n", + "2 4.541284 0.027523 9 \n", + "3 4.688000 0.032000 8 \n", + "4 4.541284 0.027523 9 \n", + "\n", + " docq_lorem_ipsum_ratio docq_curly_bracket_ratio docq_contain_bad_word \\\n", + "2 0.0 0.0 False \n", + "3 0.0 0.0 False \n", + "4 0.0 0.0 False \n", + "\n", + " docq_bullet_point_ratio docq_ellipsis_line_ratio \\\n", + "2 0.176471 0.0 \n", + "3 0.176471 0.0 \n", + "4 0.176471 0.0 \n", + "\n", + " docq_alphabet_word_ratio docq_contain_common_en_words \n", + "2 0.880734 True \n", + "3 0.880000 True \n", + "4 0.880734 True \n", + "\n", + "[3 rows x 27 columns]" ] }, - "execution_count": 29, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "output_df[['filename', 'contents']]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "6bdd3515", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6bdd3515", - "outputId": "e6e3f2c0-5b23-4336-bc95-013921f0724a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========== mars.pdf ===========\n", - 
"-------Chunk 0------\n", - "Solar System\n", - "Our solar system is a vast and fascinating expanse, comprising eight planets, five dwarf planets, numerous moons, asteroids, comets, and other celestial bodies. At its center lies the star we call the Sun.\n", - "-------\n", - "-------Chunk 1------\n", - "Mars\n", - "Mars, the fourth planet from the Sun, is a cold, desert world with a thin atmosphere composed primarily of carbon dioxide. Its reddish hue comes from iron oxide, or rust, prevalent on its surface.\n", - "-------\n", - "-------Chunk 2------\n", - "Basic facts about Mars:\n", - "· Distance from the Sun: Average of 228 million kilometers (142 million miles)\n", - "· Rotation Period: 24.6 hours (one Martian day - called a \"sol\")\n", - "· Moons: Two small moons, Phobos and Deimos.\n", - "-------\n", - "========== earth.pdf ===========\n", - "-------Chunk 0------\n", - "Solar System\n", - "For more details about our Solar system see Chapter 1.\n", - "-------\n", - "-------Chunk 1------\n", - "Earth\n", - "Earth is the third planet from the Sun. It's our home planet. Earth is the only place we know of with life.\n", - "-------\n", - "-------Chunk 2------\n", - "Earth\n", - "Basic facts about Earth:\n", - "· Distance from the Sun: Average of 149.6 million kilometers (93 million miles)\n", - "· Rotation Period: 24 hours (one day)\n", - "· Moons: One moon, called Luna or simply \"the Moon\".\n", - "-------\n" - ] - } - ], - "source": [ - "for f in output_df['filename'].unique():\n", - " print ('==========' , f, '===========')\n", - " chunks = output_df[output_df['filename'] == f]['contents']\n", - " for idx , chunk in enumerate(chunks):\n", - " print (f'-------Chunk {idx}------\\n{chunk}\\n-------')" - ] - }, - { - "cell_type": "markdown", - "id": "2b34d9c6", - "metadata": { - "id": "2b34d9c6" - }, - "source": [ - "### 7.4- Understanding the output\n", - "\n", - "So we started with 7 rows and ended up with 6. 
Fuzzy dedupe removed the following **very similar** chunk.\n", - "\n", - "These are pretty similar chunks except for the words 'the' and 'our'\n", - "\n", - "**earth.pdf**\n", - "\n", - "`For more details about *our* Solar system see Chapter 1.`\n", - "\n", - "**mars.pdf**\n", - "\n", - "`For more details about *the* Solar system see Chapter 1.`\n", - "\n", - "Pretty neat, eh? 👏\n", - "\n", - "### Configuring Fuzzy de-dupe\n", - "\n", - "You can tweak fuzzy dedupe by tweaking the following parameters\n", - "\n", - "```python\n", - "# fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.7, # (default 0.8)\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", - "```\n", - "\n", - "In our case, we set `fdedup_threshold` parameter to 0.7. \n" - ] - }, - { - "cell_type": "markdown", - "id": "5370950a-2a3a-4143-8218-f9b4808099ba", - "metadata": { - "id": "5370950a-2a3a-4143-8218-f9b4808099ba" - }, - "source": [ - "## Step-8: Text encoding\n", - "\n", - "Encode text for the vector storage." 
- ] - }, - { - "cell_type": "markdown", - "id": "85aba685", - "metadata": { - "id": "85aba685" - }, - "source": [ - "### 8.1 - Set Input/output Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "20a153fa-fd56-401e-86be-4f7617affcc8", - "outputId": "530e65c6-7ceb-4c73-cb87-50da46c78add" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" - ] - } - ], - "source": [ - "STAGE = 6\n", + "all_docs_df = read_parquet_files_as_df(output_doc_quality_dir)\n", "\n", - "input_folder = output_fuzzy_dedupe_dir # previous output folder is the input folder for the current stage\n", - "output_folder = output_embeddings_dir\n", + "# remove documents with badwords\n", + "clean_docs_df = all_docs_df[all_docs_df['docq_contain_bad_word'] == False]\n", "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "# also filter out 'lorem ipsum' text\n", + "clean_docs_df = clean_docs_df[clean_docs_df['docq_lorem_ipsum_ratio'] == 0]\n", "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + "clean_docs_df.head(10)" ] }, { "cell_type": "markdown", - "id": "c97545f4", - "metadata": { - "id": "c97545f4" - }, + "id": "5861461a", + "metadata": {}, "source": [ - "### 8.2 - Execute" + "## Step-9: Copy output to final output dir" ] }, { "cell_type": "code", - "execution_count": 32, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 914, - "referenced_widgets": [ - "8b7571c585df431eb901fcdebdf8177e", - "06107a2f48b3491f91bbe84e46e10ba0", - "bd74356eca18423aa0373c808d9097e3", - "7e13e8779a81400f996d4428c74acfaf", - "a75892696be546a3970962bae7bf732a", - 
"68997339f13240a4824a9e416096bee4", - "919b086abd314077bbff75687392bd91", - "b4c209371e7a403986991a786cfb296d", - "6c08de2dd9a2402c90b1a7a645db9b13", - "91fff81a1de8487c9009e872b751edb0", - "ada62d24cbcf4361acbb21808f334d33" - ] - }, - "id": "228df6b2-bc62-494b-9697-03ece98d7853", - "outputId": "b10eecc1-cd17-49c1-e3b1-b80e0e1bfa86" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:32:37 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", - "13:32:37 INFO - pipeline id pipeline_id\n", - "13:32:37 INFO - code location None\n", - "13:32:37 INFO - number of workers 2 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:37 INFO - actor creation delay 0\n", - "13:32:37 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:37 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", - "13:32:37 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:37 INFO - Running locally\n", - "2024-10-18 13:32:39,609\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - orchestrator started at 2024-10-18 13:32:42\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of files is 2, source profile {'max_file_size': 0.009654045104980469, 'min_file_size': 0.00907135009765625, 'total_file_size': 0.01872539520263672}\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 14.943363189697266, 'object_store': 7.471681594848633}\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Number of workers - 2 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:42 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - Completed processing 2 files in 0.087 min\n", - "\u001b[36m(orchestrate pid=17394)\u001b[0m 13:32:47 INFO - done flushing in 0.001 sec\n", - "13:32:57 INFO - Completed execution in 0.333 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:6 completed successfully\n", - "CPU times: user 607 ms, sys: 226 ms, total: 833 ms\n", - "Wall time: 22.1 s\n" - ] - } - ], + "execution_count": 22, + "id": "8d1b50f7", + "metadata": {}, + "outputs": [], "source": [ - "%%time\n", - "\n", - "from text_encoder_transform_ray import TextEncoderRayTransformConfiguration\n", - "\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # text_encoder\n", - " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", - "}\n", - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "# create launcher\n", - "launcher = RayTransformLauncher(TextEncoderRayTransformConfiguration())\n", - "# Launch the ray actor(s) to process the input\n", + "import shutil\n", "\n", - "return_code = launcher.launch()\n", + "shutil.rmtree(output_final_dir, ignore_errors=True)\n", + "shutil.os.makedirs(output_final_dir, exist_ok=True)\n", "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "b734852c", - "metadata": { - "id": "b734852c" - }, - "source": [ - "### 8.3 - Inspect Generated output\n", + "output_final_dir_parquet = os.path.join (output_final_dir, 'pq')\n", + "shutil.os.makedirs(output_final_dir_parquet, exist_ok=True)\n", "\n", - "You will see a column called `embeddings` added at the end. This the text content converted into vectors or embeddings. 
We used the model `sentence-transformers/all-MiniLM-L6-v2`" + "output_final_dir_markdown = os.path.join (output_final_dir, 'markdown')\n", + "shutil.os.makedirs(output_final_dir_markdown, exist_ok=True)" ] }, { "cell_type": "code", - "execution_count": 33, - "id": "7b1c1d09", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 659 - }, - "id": "7b1c1d09", - "outputId": "70612634-b336-4ad5-ddb3-782ca0676bae" - }, + "execution_count": 23, + "id": "ba897dd9", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (6, 18)\n", - "Output data dimensions (rows x columns)= (6, 19)\n" + "✅ Saved CLEAN parquet output to 'output/output_final/pq'\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
filenamenum_pagesnum_tablesnum_doc_elementsexthashsizedate_acquiredpdf_convert_timesource_filenamesource_document_idcontentsdoc_jsonpathpage_numberbboxdocument_idchunk_idchunk_hashembeddings
0mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Solar System\\nOur solar system is a vast and f...$.main-text[2]1[132.84518433, 588.96014404, 479.40917969, 623...44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674...4-1[0.0077404897, -0.020559434, 0.026426662, 0.01...
1mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Mars\\nMars, the fourth planet from the Sun, is...$.main-text[5]1[132.87440491, 500.84011841, 477.48345947, 534...a31663e06fac41470ecc459f5a58658a3f9997d7801053...6-1[0.07728298, 0.024971062, -0.04318075, 0.05809...
2mars.pdf1011pdf8edd5dfbf888777120b528a5d8998f2757d006df0eaef7...28002024-10-18T13:30:59.4900072.011138mars.pdf62e5639f-f922-4ccc-a041-3cb02f1cfd83Basic facts about Mars:\\n· Distance from the S...$.main-text[6]1[133.2026062, 482.90710449, 237.04431152, 493....7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a...7-1[0.1059802, 0.025460616, 0.02362733, 0.0390564...
3earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Solar System\\nFor more details about our Solar...$.main-text[3]1[133.20942688, 570.81555176, 375.57919312, 581...d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d...15[-0.062105577, -0.0053322953, 0.03127779, 0.04...
4earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nEarth is the third planet from the Sun....$.main-text[5]1[132.91053772, 512.46295166, 477.84887695, 534...7c4a750e2215f231803a6f8078bde1e9699034fb033dd3...2-1[0.0724358, -0.058001805, -0.01977186, -0.0243...
5earth.pdf1011pdf18713f970989055625bef22209b6f4b6830b9ca22046bf...26862024-10-18T13:30:59.4940272.015123earth.pdff3c0ac2e-1de2-472b-8216-2043f3b3e9d1Earth\\nBasic facts about Earth:\\n· Distance fr...$.main-text[6]1[133.30151367, 494.86206055, 240.17156982, 505...189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f...3-1[0.091821924, 0.015197907, 0.07716932, 0.01711...
\n", - "
" - ], - "text/plain": [ - " filename num_pages num_tables num_doc_elements ext \\\n", - "0 mars.pdf 1 0 11 pdf \n", - "1 mars.pdf 1 0 11 pdf \n", - "2 mars.pdf 1 0 11 pdf \n", - "3 earth.pdf 1 0 11 pdf \n", - "4 earth.pdf 1 0 11 pdf \n", - "5 earth.pdf 1 0 11 pdf \n", - "\n", - " hash size \\\n", - "0 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "1 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "2 8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... 2800 \n", - "3 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "4 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "5 18713f970989055625bef22209b6f4b6830b9ca22046bf... 2686 \n", - "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "0 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "1 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "2 2024-10-18T13:30:59.490007 2.011138 mars.pdf \n", - "3 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "4 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "5 2024-10-18T13:30:59.494027 2.015123 earth.pdf \n", - "\n", - " source_document_id \\\n", - "0 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "1 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "2 62e5639f-f922-4ccc-a041-3cb02f1cfd83 \n", - "3 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "4 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "5 f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 \n", - "\n", - " contents doc_jsonpath \\\n", - "0 Solar System\\nOur solar system is a vast and f... $.main-text[2] \n", - "1 Mars\\nMars, the fourth planet from the Sun, is... $.main-text[5] \n", - "2 Basic facts about Mars:\\n· Distance from the S... $.main-text[6] \n", - "3 Solar System\\nFor more details about our Solar... $.main-text[3] \n", - "4 Earth\\nEarth is the third planet from the Sun.... $.main-text[5] \n", - "5 Earth\\nBasic facts about Earth:\\n· Distance fr... $.main-text[6] \n", - "\n", - " page_number bbox \\\n", - "0 1 [132.84518433, 588.96014404, 479.40917969, 623... 
\n", - "1 1 [132.87440491, 500.84011841, 477.48345947, 534... \n", - "2 1 [133.2026062, 482.90710449, 237.04431152, 493.... \n", - "3 1 [133.20942688, 570.81555176, 375.57919312, 581... \n", - "4 1 [132.91053772, 512.46295166, 477.84887695, 534... \n", - "5 1 [133.30151367, 494.86206055, 240.17156982, 505... \n", - "\n", - " document_id chunk_id chunk_hash \\\n", - "0 44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... 4 -1 \n", - "1 a31663e06fac41470ecc459f5a58658a3f9997d7801053... 6 -1 \n", - "2 7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... 7 -1 \n", - "3 d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... 1 5 \n", - "4 7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... 2 -1 \n", - "5 189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... 3 -1 \n", - "\n", - " embeddings \n", - "0 [0.0077404897, -0.020559434, 0.026426662, 0.01... \n", - "1 [0.07728298, 0.024971062, -0.04318075, 0.05809... \n", - "2 [0.1059802, 0.025460616, 0.02362733, 0.0390564... \n", - "3 [-0.062105577, -0.0053322953, 0.03127779, 0.04... \n", - "4 [0.0724358, -0.058001805, -0.01977186, -0.0243... \n", - "5 [0.091821924, 0.015197907, 0.07716932, 0.01711... 
" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "from my_utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "## save parquet\n", "\n", - "output_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "f5e12630-be6b-4188-a925-77117155617b", - "metadata": { - "id": "f5e12630-be6b-4188-a925-77117155617b" - }, - "source": [ - "## Step-9: Copy output to final output dir" + "clean_docs_df.to_parquet(os.path.join(output_final_dir_parquet, \"clean_docs.parquet\"))\n", + "print (f\"✅ Saved CLEAN parquet output to '{output_final_dir_parquet}'\")" ] }, { "cell_type": "code", - "execution_count": 34, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", - "outputId": "d151e618-6528-40b5-fdbd-1c67291a7279" - }, + "execution_count": 24, + "id": "867bb0f7", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "✅ Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" + "✅ Saved CLEAN markdown output to 'output/output_final/markdown'\n" ] } ], "source": [ - "import shutil\n", + "## save markdown text\n", "\n", - "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", - "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", + "for index, row in clean_docs_df.iterrows():\n", + " output_file_name = os.path.join (output_final_dir_markdown, row['filename'] + '.md')\n", + " with open(output_file_name, 'w') as output_file:\n", + " output_file.write(row['contents'])\n", "\n", - "print (f\"✅ Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + "print (f\"✅ Saved CLEAN markdown 
output to '{output_final_dir_markdown}'\")\n" ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "dc0a6728", - "metadata": { - "id": "dc0a6728" - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -3991,7 +2534,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "dpk-3-basic-022dev1-py311", + "display_name": "dpk-6-pdf-processing-r1.0.0-all-py3.11", "language": "python", "name": "python3" }, @@ -4005,7 +2548,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.10" + "version": "3.11.11" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/notebooks/pdf-processing-1/requirements.txt b/examples/notebooks/pdf-processing-1/requirements.txt new file mode 100644 index 000000000..ffd42dafe --- /dev/null +++ b/examples/notebooks/pdf-processing-1/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit-transforms[ray,all]==1.0.0 + +# jupyter +jupyterlab +ipykernel +ipywidgets From 4662fa93cbc2dc65cd0ff32695cb36636c6ed411 Mon Sep 17 00:00:00 2001 From: Sujee Maniyam Date: Tue, 4 Feb 2025 22:45:50 -0800 Subject: [PATCH 5/6] Refactoring data files into 'examples/data-files' Signed-off-by: Sujee Maniyam --- .../data-files/pdf-processing-1/README.md | 11 + .../pdf-processing-1}/earth-copy.pdf | Bin .../pdf-processing-1}/earth.md | 0 .../pdf-processing-1}/earth.pdf | Bin .../pdf-processing-1}/earth2.md | 0 .../pdf-processing-1}/earth2.pdf | Bin .../pdf-processing-1}/lorem-ipsum.md | 0 .../pdf-processing-1}/lorem-ipsum.pdf | Bin .../pdf-processing-1}/mars.md | 0 .../pdf-processing-1}/mars.pdf | Bin .../pdf-processing-1}/spam.md | 0 .../pdf-processing-1}/spam.pdf | Bin examples/notebooks/pdf-processing-1/README.md | 19 +- .../pdf_processing_1_python.ipynb | 626 +++++++------- .../pdf_processing_1_ray.ipynb | 790 +++++++++--------- 15 files changed, 725 insertions(+), 721 deletions(-) create mode 100644 examples/data-files/pdf-processing-1/README.md rename 
examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/earth-copy.pdf (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/earth.md (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/earth.pdf (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/earth2.md (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/earth2.pdf (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/lorem-ipsum.md (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/lorem-ipsum.pdf (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/mars.md (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/mars.pdf (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/spam.md (100%) rename examples/{notebooks/pdf-processing-1/input => data-files/pdf-processing-1}/spam.pdf (100%) diff --git a/examples/data-files/pdf-processing-1/README.md b/examples/data-files/pdf-processing-1/README.md new file mode 100644 index 000000000..e81e80ee8 --- /dev/null +++ b/examples/data-files/pdf-processing-1/README.md @@ -0,0 +1,11 @@ +## Creating Input PDFs (Optional) + +Sample PDFs we use for this example are created from markdown documents using pandoc utility, as follows. 
+ +```bash +pandoc earth.md -o earth.pdf +pandoc earth2.md -o earth2.pdf +pandoc mars.md -o mars.pdf +pandoc spam.md -o spam.pdf +pandoc lorem-ipsum.md -o lorem-ipsum.pdf +``` \ No newline at end of file diff --git a/examples/notebooks/pdf-processing-1/input/earth-copy.pdf b/examples/data-files/pdf-processing-1/earth-copy.pdf similarity index 100% rename from examples/notebooks/pdf-processing-1/input/earth-copy.pdf rename to examples/data-files/pdf-processing-1/earth-copy.pdf diff --git a/examples/notebooks/pdf-processing-1/input/earth.md b/examples/data-files/pdf-processing-1/earth.md similarity index 100% rename from examples/notebooks/pdf-processing-1/input/earth.md rename to examples/data-files/pdf-processing-1/earth.md diff --git a/examples/notebooks/pdf-processing-1/input/earth.pdf b/examples/data-files/pdf-processing-1/earth.pdf similarity index 100% rename from examples/notebooks/pdf-processing-1/input/earth.pdf rename to examples/data-files/pdf-processing-1/earth.pdf diff --git a/examples/notebooks/pdf-processing-1/input/earth2.md b/examples/data-files/pdf-processing-1/earth2.md similarity index 100% rename from examples/notebooks/pdf-processing-1/input/earth2.md rename to examples/data-files/pdf-processing-1/earth2.md diff --git a/examples/notebooks/pdf-processing-1/input/earth2.pdf b/examples/data-files/pdf-processing-1/earth2.pdf similarity index 100% rename from examples/notebooks/pdf-processing-1/input/earth2.pdf rename to examples/data-files/pdf-processing-1/earth2.pdf diff --git a/examples/notebooks/pdf-processing-1/input/lorem-ipsum.md b/examples/data-files/pdf-processing-1/lorem-ipsum.md similarity index 100% rename from examples/notebooks/pdf-processing-1/input/lorem-ipsum.md rename to examples/data-files/pdf-processing-1/lorem-ipsum.md diff --git a/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf b/examples/data-files/pdf-processing-1/lorem-ipsum.pdf similarity index 100% rename from 
examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf rename to examples/data-files/pdf-processing-1/lorem-ipsum.pdf diff --git a/examples/notebooks/pdf-processing-1/input/mars.md b/examples/data-files/pdf-processing-1/mars.md similarity index 100% rename from examples/notebooks/pdf-processing-1/input/mars.md rename to examples/data-files/pdf-processing-1/mars.md diff --git a/examples/notebooks/pdf-processing-1/input/mars.pdf b/examples/data-files/pdf-processing-1/mars.pdf similarity index 100% rename from examples/notebooks/pdf-processing-1/input/mars.pdf rename to examples/data-files/pdf-processing-1/mars.pdf diff --git a/examples/notebooks/pdf-processing-1/input/spam.md b/examples/data-files/pdf-processing-1/spam.md similarity index 100% rename from examples/notebooks/pdf-processing-1/input/spam.md rename to examples/data-files/pdf-processing-1/spam.md diff --git a/examples/notebooks/pdf-processing-1/input/spam.pdf b/examples/data-files/pdf-processing-1/spam.pdf similarity index 100% rename from examples/notebooks/pdf-processing-1/input/spam.pdf rename to examples/data-files/pdf-processing-1/spam.pdf diff --git a/examples/notebooks/pdf-processing-1/README.md b/examples/notebooks/pdf-processing-1/README.md index 70337476f..c7fdf8ffb 100644 --- a/examples/notebooks/pdf-processing-1/README.md +++ b/examples/notebooks/pdf-processing-1/README.md @@ -31,8 +31,11 @@ pip3 install -r requirements.txt jupyter lab ``` -## Running the code +## Data Files + +PDF files are located in [examples/data-files/pdf-processing-1](../../data-files/pdf-processing-1/) +## Running the code [python version](pdf_processing_1_python.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sujee/data-prep-kit/blob/process-pdf-1/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb) @@ -48,17 +51,3 @@ python -m ipykernel install --user --name=data-prep-kit --display-name "dataprep ``` -## Creating Input PDFs 
(Optional) - -Sample PDFs we use for this example are created from markdown documents using pandoc utility, as follows. - -```bash -cd input - -pandoc earth.md -o earth.pdf -pandoc earth2.md -o earth2.pdf -pandoc mars.md -o mars.pdf -pandoc spam.md -o spam.pdf -pandoc lorem-ipsum.md -o lorem-ipsum.pdf -``` - diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb index 90a09cfe6..e4f9aa713 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb @@ -212,8 +212,12 @@ "import os, sys\n", "import shutil\n", "\n", - "input_dir = \"input\"\n", - "shutil.os.makedirs(input_dir, exist_ok=True)\n", + "if RUNNING_IN_COLAB:\n", + " input_dir = \"input\"\n", + " shutil.os.makedirs(input_dir, exist_ok=True)\n", + "else:\n", + " input_dir = \"../../data-files/pdf-processing-1/\"\n", + " \n", "output_dir = \"output\"\n", "\n", "output_pdf2pq_dir = os.path.join (output_dir, '01_pdf2pq_out')\n", @@ -351,28 +355,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "Local file 'input/earth.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/earth-copy.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/earth2.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/mars.pdf' (57.87 KB) already exists. Skipping download.\n", - "Local file 'input/spam.pdf' (24.87 KB) already exists. Skipping download.\n", - "Local file 'input/lorem-ipsum.pdf' (25.72 KB) already exists. 
Skipping download.\n" + "Using input files from : ../../data-files/pdf-processing-1/\n" ] } ], "source": [ + "if RUNNING_IN_COLAB:\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf', 
os.path.join(input_dir, 'spam.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))" + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", + "else:\n", + " print ('Using input files from : ', input_dir)" ] }, { @@ -431,7 +433,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_pdf2pq_out'\n", + "🏃🏼 STAGE-1: Processing input='../../data-files/pdf-processing-1/' --> output='output/01_pdf2pq_out'\n", "\n" ] }, @@ -439,21 +441,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:27:11 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "11:27:11 INFO - pipeline id pipeline_id\n", - "11:27:11 INFO - code location None\n", - "11:27:11 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_pdf2pq_out\n", - "11:27:11 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:27:11 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "11:27:11 INFO - orchestrator pdf2parquet started at 2025-01-29 11:27:11\n", - "11:27:11 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "11:27:11 INFO - Initializing models\n" + "22:09:52 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 
'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "22:09:52 INFO - pipeline id pipeline_id\n", + "22:09:52 INFO - code location None\n", + "22:09:52 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", + "22:09:52 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:09:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "22:09:52 INFO - orchestrator pdf2parquet started at 2025-02-04 22:09:52\n", + "22:09:52 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "22:09:52 INFO - Initializing models\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1e7c9a2ba2a841a0b84db84b3d60974e", + "model_id": "730bac430d12421f9c300c4ee68b11ca", "version_major": 2, "version_minor": 0 }, @@ -468,15 +470,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:27:18 INFO - Completed 1 files (16.67%) in 0.019 min\n", - "11:27:19 INFO - Completed 2 files (33.33%) in 0.034 min\n", - "11:27:19 INFO - Completed 3 files (50.0%) in 0.045 min\n", - "11:27:20 INFO - Completed 4 files (66.67%) in 0.055 min\n", - "11:27:21 INFO - Completed 5 files (83.33%) in 0.066 min\n", - "11:27:21 INFO - Completed 6 files (100.0%) in 0.078 min\n", - "11:27:21 INFO - Done processing 6 files, waiting for flush() completion.\n", - "11:27:21 INFO - done flushing in 0.0 sec\n", - "11:27:21 INFO - Completed execution in 0.168 min, execution result 0\n" + "22:09:57 INFO - Completed 1 files (16.67%) in 0.019 min\n", + "22:09:58 INFO - Completed 2 files (33.33%) in 0.034 min\n", + "22:09:58 INFO - Completed 3 files (50.0%) in 0.044 min\n", + "22:09:59 INFO - Completed 4 files (66.67%) in 0.054 
min\n", + "22:10:00 INFO - Completed 5 files (83.33%) in 0.065 min\n", + "22:10:00 INFO - Completed 6 files (100.0%) in 0.076 min\n", + "22:10:00 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:10:00 INFO - done flushing in 0.0 sec\n", + "22:10:00 INFO - Completed execution in 0.139 min, execution result 0\n" ] }, { @@ -484,8 +486,8 @@ "output_type": "stream", "text": [ "✅ Stage:1 completed successfully\n", - "CPU times: user 21.5 s, sys: 2.22 s, total: 23.7 s\n", - "Wall time: 14.1 s\n" + "CPU times: user 20.8 s, sys: 2.35 s, total: 23.1 s\n", + "Wall time: 12.4 s\n" ] } ], @@ -586,13 +588,13 @@ " 1\n", " 0\n", " 2\n", - " 5f1e1408-481a-4463-b7d1-7048da581607\n", + " da0c5fcb-38a7-461e-965d-d76b8b759190\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-29T11:27:20.388498\n", - " 0.628371\n", + " 2025-02-04T22:09:59.393899\n", + " 0.611141\n", " lorem-ipsum.pdf\n", " \n", " \n", @@ -602,13 +604,13 @@ " 1\n", " 0\n", " 2\n", - " dc0531e1-bc12-4919-8e27-13763592280e\n", + " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-29T11:27:21.754298\n", - " 0.730394\n", + " 2025-02-04T22:10:00.665091\n", + " 0.609956\n", " spam.pdf\n", " \n", " \n", @@ -618,13 +620,13 @@ " 1\n", " 0\n", " 11\n", - " 74eaf9f3-716d-43c6-9cbc-b1497454d33b\n", + " 281e5825-705a-4710-941c-82ce3628dfd9\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-29T11:27:19.749728\n", - " 0.643720\n", + " 2025-02-04T22:09:58.781029\n", + " 0.619509\n", " earth2.pdf\n", " \n", " \n", @@ -634,13 +636,13 @@ " 1\n", " 0\n", " 11\n", - " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", + " d51af835-96b4-4ba9-b77e-543d964f2d30\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 
2025-01-29T11:27:21.013529\n", - " 0.614578\n", + " 2025-02-04T22:10:00.053520\n", + " 0.657007\n", " mars.pdf\n", " \n", " \n", @@ -650,13 +652,13 @@ " 1\n", " 0\n", " 11\n", - " 523f8ace-a61e-4f27-9970-84581ba6626a\n", + " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:27:18.200289\n", - " 1.119966\n", + " 2025-02-04T22:09:57.243139\n", + " 1.110305\n", " earth-copy.pdf\n", " \n", " \n", @@ -666,13 +668,13 @@ " 1\n", " 0\n", " 11\n", - " a41f26ae-fec2-43e6-8225-36bd944b4684\n", + " f75a8fad-c39a-42bd-9c47-a0975312bc5d\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:27:19.098169\n", - " 0.881966\n", + " 2025-02-04T22:09:58.159736\n", + " 0.902815\n", " earth.pdf\n", " \n", " \n", @@ -697,12 +699,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", - "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", - "2 74eaf9f3-716d-43c6-9cbc-b1497454d33b 10729312978404042321 pdf \n", - "3 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", - "4 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", - "5 a41f26ae-fec2-43e6-8225-36bd944b4684 14711865278795535908 pdf \n", + "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", + "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", + "2 281e5825-705a-4710-941c-82ce3628dfd9 10729312978404042321 pdf \n", + "3 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", + "4 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", + "5 f75a8fad-c39a-42bd-9c47-a0975312bc5d 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -713,12 +715,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \n", - "0 2025-01-29T11:27:20.388498 0.628371 lorem-ipsum.pdf \n", - "1 2025-01-29T11:27:21.754298 0.730394 spam.pdf \n", - "2 2025-01-29T11:27:19.749728 0.643720 earth2.pdf \n", - "3 2025-01-29T11:27:21.013529 0.614578 mars.pdf \n", - "4 2025-01-29T11:27:18.200289 1.119966 earth-copy.pdf \n", - "5 2025-01-29T11:27:19.098169 0.881966 earth.pdf " + "0 2025-02-04T22:09:59.393899 0.611141 lorem-ipsum.pdf \n", + "1 2025-02-04T22:10:00.665091 0.609956 spam.pdf \n", + "2 2025-02-04T22:09:58.781029 0.619509 earth2.pdf \n", + "3 2025-02-04T22:10:00.053520 0.657007 mars.pdf \n", + "4 2025-02-04T22:09:57.243139 1.110305 earth-copy.pdf \n", + "5 2025-02-04T22:09:58.159736 0.902815 earth.pdf " ] }, "execution_count": 8, @@ -895,23 +897,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:27:21 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "11:27:21 INFO - pipeline id pipeline_id\n", - "11:27:21 INFO - code location None\n", - "11:27:21 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", - "11:27:21 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:27:21 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:21 INFO - orchestrator doc_id started at 2025-01-29 11:27:21\n", - "11:27:21 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", - "11:27:21 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "11:27:21 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "11:27:21 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "11:27:21 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "11:27:21 INFO - 
Completed 5 files (83.33%) in 0.0 min\n", - "11:27:21 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "11:27:21 INFO - Done processing 6 files, waiting for flush() completion.\n", - "11:27:21 INFO - done flushing in 0.0 sec\n", - "11:27:21 INFO - Completed execution in 0.0 min, execution result 0\n" + "22:10:00 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "22:10:00 INFO - pipeline id pipeline_id\n", + "22:10:00 INFO - code location None\n", + "22:10:00 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "22:10:00 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:10:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:00 INFO - orchestrator doc_id started at 2025-02-04 22:10:00\n", + "22:10:00 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "22:10:00 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "22:10:00 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "22:10:00 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "22:10:00 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "22:10:00 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "22:10:00 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "22:10:00 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:10:00 INFO - done flushing in 0.0 sec\n", + "22:10:00 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -919,8 +921,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 28 ms, sys: 2.28 ms, total: 30.3 ms\n", - "Wall time: 25.8 ms\n" + "CPU times: user 28.4 ms, sys: 2.92 ms, total: 31.3 ms\n", + "Wall 
time: 26.8 ms\n" ] } ], @@ -1025,13 +1027,13 @@ " 1\n", " 0\n", " 2\n", - " 5f1e1408-481a-4463-b7d1-7048da581607\n", + " da0c5fcb-38a7-461e-965d-d76b8b759190\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-29T11:27:20.388498\n", - " 0.628371\n", + " 2025-02-04T22:09:59.393899\n", + " 0.611141\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1043,13 +1045,13 @@ " 1\n", " 0\n", " 2\n", - " dc0531e1-bc12-4919-8e27-13763592280e\n", + " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-29T11:27:21.754298\n", - " 0.730394\n", + " 2025-02-04T22:10:00.665091\n", + " 0.609956\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1061,13 +1063,13 @@ " 1\n", " 0\n", " 11\n", - " 74eaf9f3-716d-43c6-9cbc-b1497454d33b\n", + " 281e5825-705a-4710-941c-82ce3628dfd9\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-29T11:27:19.749728\n", - " 0.643720\n", + " 2025-02-04T22:09:58.781029\n", + " 0.619509\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1079,13 +1081,13 @@ " 1\n", " 0\n", " 11\n", - " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", + " d51af835-96b4-4ba9-b77e-543d964f2d30\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-29T11:27:21.013529\n", - " 0.614578\n", + " 2025-02-04T22:10:00.053520\n", + " 0.657007\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1097,13 +1099,13 @@ " 1\n", " 0\n", " 11\n", - " 523f8ace-a61e-4f27-9970-84581ba6626a\n", + " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 
2025-01-29T11:27:18.200289\n", - " 1.119966\n", + " 2025-02-04T22:09:57.243139\n", + " 1.110305\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1115,13 +1117,13 @@ " 1\n", " 0\n", " 11\n", - " a41f26ae-fec2-43e6-8225-36bd944b4684\n", + " f75a8fad-c39a-42bd-9c47-a0975312bc5d\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:27:19.098169\n", - " 0.881966\n", + " 2025-02-04T22:09:58.159736\n", + " 0.902815\n", " earth.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 1\n", @@ -1148,12 +1150,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", - "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", - "2 74eaf9f3-716d-43c6-9cbc-b1497454d33b 10729312978404042321 pdf \n", - "3 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", - "4 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", - "5 a41f26ae-fec2-43e6-8225-36bd944b4684 14711865278795535908 pdf \n", + "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", + "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", + "2 281e5825-705a-4710-941c-82ce3628dfd9 10729312978404042321 pdf \n", + "3 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", + "4 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", + "5 f75a8fad-c39a-42bd-9c47-a0975312bc5d 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1164,12 +1166,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-29T11:27:20.388498 0.628371 lorem-ipsum.pdf \n", - "1 2025-01-29T11:27:21.754298 0.730394 spam.pdf \n", - "2 2025-01-29T11:27:19.749728 0.643720 earth2.pdf \n", - "3 2025-01-29T11:27:21.013529 0.614578 mars.pdf \n", - "4 2025-01-29T11:27:18.200289 1.119966 earth-copy.pdf \n", - "5 2025-01-29T11:27:19.098169 0.881966 earth.pdf \n", + "0 2025-02-04T22:09:59.393899 0.611141 lorem-ipsum.pdf \n", + "1 2025-02-04T22:10:00.665091 0.609956 spam.pdf \n", + "2 2025-02-04T22:09:58.781029 0.619509 earth2.pdf \n", + "3 2025-02-04T22:10:00.053520 0.657007 mars.pdf \n", + "4 2025-02-04T22:09:57.243139 1.110305 earth-copy.pdf \n", + "5 2025-02-04T22:09:58.159736 0.902815 earth.pdf \n", "\n", " doc_hash int_id_column \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", @@ -1241,24 +1243,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:27:21 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", - "11:27:21 INFO - pipeline id pipeline_id\n", - "11:27:21 INFO - code location None\n", - "11:27:21 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "11:27:21 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:27:21 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:21 INFO - orchestrator ededup started at 2025-01-29 11:27:21\n", - "11:27:21 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", - "11:27:21 INFO - Starting from the beginning\n", - "11:27:21 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "11:27:21 INFO - Completed 2 files (33.33%) in 0.0 min\n", - 
"11:27:21 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "11:27:21 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "11:27:21 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "11:27:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "11:27:22 INFO - Done processing 6 files, waiting for flush() completion.\n", - "11:27:22 INFO - done flushing in 0.0 sec\n", - "11:27:22 INFO - Completed execution in 0.0 min, execution result 0\n" + "22:10:00 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "22:10:00 INFO - pipeline id pipeline_id\n", + "22:10:00 INFO - code location None\n", + "22:10:00 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "22:10:00 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:10:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:00 INFO - orchestrator ededup started at 2025-02-04 22:10:00\n", + "22:10:00 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "22:10:00 INFO - Starting from the beginning\n", + "22:10:00 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "22:10:00 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "22:10:00 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "22:10:00 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "22:10:00 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "22:10:00 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "22:10:00 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:10:00 INFO - done flushing in 0.0 sec\n", + "22:10:00 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -1266,8 +1268,8 @@ "output_type": 
"stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 34.4 ms, sys: 2.97 ms, total: 37.3 ms\n", - "Wall time: 31.2 ms\n" + "CPU times: user 26.8 ms, sys: 11.8 ms, total: 38.6 ms\n", + "Wall time: 32.8 ms\n" ] } ], @@ -1373,13 +1375,13 @@ " 1\n", " 0\n", " 2\n", - " 5f1e1408-481a-4463-b7d1-7048da581607\n", + " da0c5fcb-38a7-461e-965d-d76b8b759190\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-29T11:27:20.388498\n", - " 0.628371\n", + " 2025-02-04T22:09:59.393899\n", + " 0.611141\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1392,13 +1394,13 @@ " 1\n", " 0\n", " 2\n", - " dc0531e1-bc12-4919-8e27-13763592280e\n", + " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-29T11:27:21.754298\n", - " 0.730394\n", + " 2025-02-04T22:10:00.665091\n", + " 0.609956\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1411,13 +1413,13 @@ " 1\n", " 0\n", " 11\n", - " 74eaf9f3-716d-43c6-9cbc-b1497454d33b\n", + " 281e5825-705a-4710-941c-82ce3628dfd9\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-29T11:27:19.749728\n", - " 0.643720\n", + " 2025-02-04T22:09:58.781029\n", + " 0.619509\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1430,13 +1432,13 @@ " 1\n", " 0\n", " 11\n", - " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", + " d51af835-96b4-4ba9-b77e-543d964f2d30\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-29T11:27:21.013529\n", - " 0.614578\n", + " 2025-02-04T22:10:00.053520\n", + " 0.657007\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1449,13 +1451,13 @@ " 1\n", " 0\n", " 11\n", - " 
523f8ace-a61e-4f27-9970-84581ba6626a\n", + " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:27:18.200289\n", - " 1.119966\n", + " 2025-02-04T22:09:57.243139\n", + " 1.110305\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1481,11 +1483,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", - "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", - "2 74eaf9f3-716d-43c6-9cbc-b1497454d33b 10729312978404042321 pdf \n", - "3 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", - "4 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", + "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", + "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", + "2 281e5825-705a-4710-941c-82ce3628dfd9 10729312978404042321 pdf \n", + "3 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", + "4 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1495,11 +1497,11 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-29T11:27:20.388498 0.628371 lorem-ipsum.pdf \n", - "1 2025-01-29T11:27:21.754298 0.730394 spam.pdf \n", - "2 2025-01-29T11:27:19.749728 0.643720 earth2.pdf \n", - "3 2025-01-29T11:27:21.013529 0.614578 mars.pdf \n", - "4 2025-01-29T11:27:18.200289 1.119966 earth-copy.pdf \n", + "0 2025-02-04T22:09:59.393899 0.611141 lorem-ipsum.pdf \n", + "1 2025-02-04T22:10:00.665091 0.609956 spam.pdf \n", + "2 2025-02-04T22:09:58.781029 0.619509 earth2.pdf \n", + "3 2025-02-04T22:10:00.053520 0.657007 mars.pdf \n", + "4 2025-02-04T22:09:57.243139 1.110305 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", @@ -1594,109 +1596,109 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:27:22 INFO - Starting SignatureCalculation step\n", - "11:27:22 INFO - Got parameters for SignatureCalculation\n", - "11:27:22 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "11:27:22 INFO - data factory scdata_ is using local configuration without input/output path\n", - "11:27:22 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "11:27:22 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:22 INFO - pipeline id pipeline_id\n", - "11:27:22 INFO - code location None\n", - "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:27:22 INFO - data factory data_ Not using data sets, checkpointing 
False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:22 INFO - orchestrator minhash started at 2025-01-29 11:27:22\n", - "11:27:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "11:27:22 INFO - Completed 1 files (16.67%) in 0.001 min\n", - "11:27:22 WARNING - table is empty, skipping processing\n", - "11:27:22 INFO - Completed 2 files (33.33%) in 0.001 min\n", - "11:27:22 INFO - Completed 3 files (50.0%) in 0.001 min\n", - "11:27:22 INFO - Completed 4 files (66.67%) in 0.001 min\n", - "11:27:22 INFO - Completed 5 files (83.33%) in 0.001 min\n", - "11:27:22 INFO - Completed 6 files (100.0%) in 0.001 min\n", - "11:27:22 INFO - Done processing 6 files, waiting for flush() completion.\n", - "11:27:22 INFO - Starting flush()\n", - "11:27:22 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", - "11:27:22 INFO - done flushing in 0.028 sec\n", - "11:27:22 INFO - Completed execution in 0.001 min, execution result 0\n", - "11:27:22 INFO - SignatureCalculation completed successfully\n", - "11:27:22 INFO - Starting ClusterAnalysis step\n", - "11:27:22 INFO - Got parameters for ClusterAnalysis\n", - "11:27:22 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "11:27:22 INFO - pipeline id pipeline_id\n", - "11:27:22 INFO - code location None\n", - "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:22 INFO - orchestrator cluster started at 
2025-01-29 11:27:22\n", - "11:27:22 INFO - Number of folders is 14\n", - "11:27:22 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "11:27:22 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "11:27:22 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "11:27:22 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "11:27:22 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "11:27:22 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "11:27:22 INFO - Completed 7 files (50.0%) in 0.0 min\n", - "11:27:22 INFO - Completed 8 files (57.14%) in 0.001 min\n", - "11:27:22 INFO - Completed 9 files (64.29%) in 0.001 min\n", - "11:27:22 INFO - Completed 10 files (71.43%) in 0.001 min\n", - "11:27:22 INFO - Completed 11 files (78.57%) in 0.001 min\n", - "11:27:22 INFO - Completed 12 files (85.71%) in 0.001 min\n", - "11:27:22 INFO - Completed 13 files (92.86%) in 0.001 min\n", - "11:27:22 INFO - Completed 14 files (100.0%) in 0.001 min\n", - "11:27:22 INFO - Done processing 14 files, waiting for flush() completion.\n", - "11:27:22 INFO - done flushing in 0.0 sec\n", - "11:27:22 INFO - Completed execution in 0.001 min, execution result 0\n", - "11:27:22 INFO - ClusterAnalysis completed successfully\n", - "11:27:22 INFO - Starting GetDuplicateList step\n", - "11:27:22 INFO - Got parameters for GetDuplicateList\n", - "11:27:22 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "11:27:22 INFO - pipeline id pipeline_id\n", - "11:27:22 INFO - code location None\n", - "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint 
['.parquet']\n", - "11:27:22 INFO - orchestrator fdlist started at 2025-01-29 11:27:22\n", - "11:27:22 INFO - Number of folders is 1\n", - "11:27:22 INFO - Get Duplicate List for folder docs_to_remove\n", - "11:27:22 INFO - 1 documents marked as duplicates\n", - "11:27:22 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "11:27:22 INFO - Done processing 1 files, waiting for flush() completion.\n", - "11:27:22 INFO - done flushing in 0.0 sec\n", - "11:27:22 INFO - Completed execution in 0.0 min, execution result 0\n", - "11:27:22 INFO - GetDuplicateList completed successfully\n", - "11:27:22 INFO - Starting DataCleaning step\n", - "11:27:22 INFO - Got parameters for DataCleaning\n", - "11:27:22 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "11:27:22 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "11:27:22 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "11:27:22 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:22 INFO - pipeline id pipeline_id\n", - "11:27:22 INFO - code location None\n", - "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:22 INFO - orchestrator fdclean started at 2025-01-29 11:27:22\n", - "11:27:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 
0.050751686096191406}\n", - "11:27:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "11:27:22 WARNING - table is empty, skipping processing\n", - "11:27:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "11:27:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "11:27:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "11:27:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "11:27:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "11:27:22 INFO - Done processing 6 files, waiting for flush() completion.\n", - "11:27:22 INFO - done flushing in 0.0 sec\n", - "11:27:22 INFO - Completed execution in 0.001 min, execution result 0\n", - "11:27:22 INFO - DataCleaning completed successfully\n" + "22:10:01 INFO - Starting SignatureCalculation step\n", + "22:10:01 INFO - Got parameters for SignatureCalculation\n", + "22:10:01 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "22:10:01 INFO - data factory scdata_ is using local configuration without input/output path\n", + "22:10:01 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "22:10:01 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:01 INFO - pipeline id pipeline_id\n", + "22:10:01 INFO - code location None\n", + "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:01 INFO - 
orchestrator minhash started at 2025-02-04 22:10:01\n", + "22:10:01 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "22:10:01 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "22:10:01 WARNING - table is empty, skipping processing\n", + "22:10:01 INFO - Completed 2 files (33.33%) in 0.001 min\n", + "22:10:01 INFO - Completed 3 files (50.0%) in 0.001 min\n", + "22:10:01 INFO - Completed 4 files (66.67%) in 0.001 min\n", + "22:10:01 INFO - Completed 5 files (83.33%) in 0.001 min\n", + "22:10:01 INFO - Completed 6 files (100.0%) in 0.001 min\n", + "22:10:01 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:10:01 INFO - Starting flush()\n", + "22:10:01 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", + "22:10:01 INFO - done flushing in 0.028 sec\n", + "22:10:01 INFO - Completed execution in 0.001 min, execution result 0\n", + "22:10:01 INFO - SignatureCalculation completed successfully\n", + "22:10:01 INFO - Starting ClusterAnalysis step\n", + "22:10:01 INFO - Got parameters for ClusterAnalysis\n", + "22:10:01 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "22:10:01 INFO - pipeline id pipeline_id\n", + "22:10:01 INFO - code location None\n", + "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:01 INFO - orchestrator cluster started at 2025-02-04 22:10:01\n", + "22:10:01 INFO - Number of folders is 14\n", + "22:10:01 INFO - Completed 1 files (7.14%) in 0.0 
min\n", + "22:10:01 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "22:10:01 INFO - Completed 3 files (21.43%) in 0.0 min\n", + "22:10:01 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "22:10:01 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "22:10:01 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "22:10:01 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "22:10:01 INFO - Completed 8 files (57.14%) in 0.001 min\n", + "22:10:01 INFO - Completed 9 files (64.29%) in 0.001 min\n", + "22:10:01 INFO - Completed 10 files (71.43%) in 0.001 min\n", + "22:10:01 INFO - Completed 11 files (78.57%) in 0.001 min\n", + "22:10:01 INFO - Completed 12 files (85.71%) in 0.001 min\n", + "22:10:01 INFO - Completed 13 files (92.86%) in 0.001 min\n", + "22:10:01 INFO - Completed 14 files (100.0%) in 0.001 min\n", + "22:10:01 INFO - Done processing 14 files, waiting for flush() completion.\n", + "22:10:01 INFO - done flushing in 0.0 sec\n", + "22:10:01 INFO - Completed execution in 0.001 min, execution result 0\n", + "22:10:01 INFO - ClusterAnalysis completed successfully\n", + "22:10:01 INFO - Starting GetDuplicateList step\n", + "22:10:01 INFO - Got parameters for GetDuplicateList\n", + "22:10:01 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "22:10:01 INFO - pipeline id pipeline_id\n", + "22:10:01 INFO - code location None\n", + "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:01 INFO - orchestrator fdlist started at 2025-02-04 22:10:01\n", + "22:10:01 INFO - Number of folders is 
1\n", + "22:10:01 INFO - Get Duplicate List for folder docs_to_remove\n", + "22:10:01 INFO - 1 documents marked as duplicates\n", + "22:10:01 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "22:10:01 INFO - Done processing 1 files, waiting for flush() completion.\n", + "22:10:01 INFO - done flushing in 0.0 sec\n", + "22:10:01 INFO - Completed execution in 0.0 min, execution result 0\n", + "22:10:01 INFO - GetDuplicateList completed successfully\n", + "22:10:01 INFO - Starting DataCleaning step\n", + "22:10:01 INFO - Got parameters for DataCleaning\n", + "22:10:01 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "22:10:01 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "22:10:01 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "22:10:01 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:01 INFO - pipeline id pipeline_id\n", + "22:10:01 INFO - code location None\n", + "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:01 INFO - orchestrator fdclean started at 2025-02-04 22:10:01\n", + "22:10:01 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "22:10:01 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "22:10:01 WARNING - table is empty, skipping 
processing\n", + "22:10:01 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "22:10:01 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "22:10:01 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "22:10:01 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "22:10:01 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "22:10:01 INFO - Done processing 6 files, waiting for flush() completion.\n", + "22:10:01 INFO - done flushing in 0.0 sec\n", + "22:10:01 INFO - Completed execution in 0.0 min, execution result 0\n", + "22:10:01 INFO - DataCleaning completed successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 223 ms, sys: 99.6 ms, total: 322 ms\n", - "Wall time: 282 ms\n" + "CPU times: user 232 ms, sys: 98.5 ms, total: 331 ms\n", + "Wall time: 277 ms\n" ] } ], @@ -1810,13 +1812,13 @@ " 1\n", " 0\n", " 2\n", - " 5f1e1408-481a-4463-b7d1-7048da581607\n", + " da0c5fcb-38a7-461e-965d-d76b8b759190\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-29T11:27:20.388498\n", - " 0.628371\n", + " 2025-02-04T22:09:59.393899\n", + " 0.611141\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1829,13 +1831,13 @@ " 1\n", " 0\n", " 2\n", - " dc0531e1-bc12-4919-8e27-13763592280e\n", + " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-29T11:27:21.754298\n", - " 0.730394\n", + " 2025-02-04T22:10:00.665091\n", + " 0.609956\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1848,13 +1850,13 @@ " 1\n", " 0\n", " 11\n", - " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", + " d51af835-96b4-4ba9-b77e-543d964f2d30\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-29T11:27:21.013529\n", - " 0.614578\n", + " 2025-02-04T22:10:00.053520\n", + " 
0.657007\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1867,13 +1869,13 @@ " 1\n", " 0\n", " 11\n", - " 523f8ace-a61e-4f27-9970-84581ba6626a\n", + " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:27:18.200289\n", - " 1.119966\n", + " 2025-02-04T22:09:57.243139\n", + " 1.110305\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1897,10 +1899,10 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", - "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", - "2 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", - "3 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", + "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", + "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", + "2 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", + "3 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1909,10 +1911,10 @@ "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-29T11:27:20.388498 0.628371 lorem-ipsum.pdf \n", - "1 2025-01-29T11:27:21.754298 0.730394 spam.pdf \n", - "2 2025-01-29T11:27:21.013529 0.614578 mars.pdf \n", - "3 2025-01-29T11:27:18.200289 1.119966 earth-copy.pdf \n", + "0 2025-02-04T22:09:59.393899 0.611141 lorem-ipsum.pdf \n", + "1 2025-02-04T22:10:00.665091 0.609956 spam.pdf \n", + "2 2025-02-04T22:10:00.053520 0.657007 mars.pdf \n", + "3 2025-02-04T22:09:57.243139 1.110305 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 [] \n", @@ -1992,27 +1994,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:27:22 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "11:27:22 INFO - data factory docq_ is using local configuration without input/output path\n", - "11:27:22 INFO - data factory docq_ max_files -1, n_sample -1\n", - "11:27:22 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:22 INFO - pipeline id pipeline_id\n", - "11:27:22 INFO - code location None\n", - "11:27:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", - "11:27:22 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:27:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:27:22 INFO - orchestrator docq started at 2025-01-29 11:27:22\n", - "11:27:22 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", - "11:27:22 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "11:27:22 INFO - Completed 1 files (20.0%) in 0.0 min\n", - "11:27:22 WARNING - table is empty, skipping processing\n", - "11:27:22 INFO - Completed 2 files (40.0%) in 0.0 min\n", - "11:27:22 INFO - Completed 3 files (60.0%) in 0.0 min\n", - "11:27:22 INFO - Completed 4 files (80.0%) in 0.0 min\n", - "11:27:22 INFO - Completed 5 files 
(100.0%) in 0.0 min\n", - "11:27:22 INFO - Done processing 5 files, waiting for flush() completion.\n", - "11:27:22 INFO - done flushing in 0.0 sec\n", - "11:27:22 INFO - Completed execution in 0.0 min, execution result 0\n" + "22:10:01 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "22:10:01 INFO - data factory docq_ is using local configuration without input/output path\n", + "22:10:01 INFO - data factory docq_ max_files -1, n_sample -1\n", + "22:10:01 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:01 INFO - pipeline id pipeline_id\n", + "22:10:01 INFO - code location None\n", + "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:10:01 INFO - orchestrator docq started at 2025-02-04 22:10:01\n", + "22:10:01 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", + "22:10:01 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "22:10:01 INFO - Completed 1 files (20.0%) in 0.0 min\n", + "22:10:01 WARNING - table is empty, skipping processing\n", + "22:10:01 INFO - Completed 2 files (40.0%) in 0.0 min\n", + "22:10:01 INFO - 
Completed 3 files (60.0%) in 0.0 min\n", + "22:10:01 INFO - Completed 4 files (80.0%) in 0.001 min\n", + "22:10:01 INFO - Completed 5 files (100.0%) in 0.001 min\n", + "22:10:01 INFO - Done processing 5 files, waiting for flush() completion.\n", + "22:10:01 INFO - done flushing in 0.0 sec\n", + "22:10:01 INFO - Completed execution in 0.001 min, execution result 0\n" ] }, { @@ -2020,8 +2022,8 @@ "output_type": "stream", "text": [ "✅ Stage:5 completed successfully\n", - "CPU times: user 34.5 ms, sys: 5.21 ms, total: 39.7 ms\n", - "Wall time: 35.4 ms\n" + "CPU times: user 54.2 ms, sys: 4.18 ms, total: 58.3 ms\n", + "Wall time: 53 ms\n" ] } ], @@ -2136,7 +2138,7 @@ " 1\n", " 0\n", " 2\n", - " 5f1e1408-481a-4463-b7d1-7048da581607\n", + " da0c5fcb-38a7-461e-965d-d76b8b759190\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", @@ -2160,7 +2162,7 @@ " 1\n", " 0\n", " 2\n", - " dc0531e1-bc12-4919-8e27-13763592280e\n", + " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", @@ -2184,7 +2186,7 @@ " 1\n", " 0\n", " 11\n", - " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", + " d51af835-96b4-4ba9-b77e-543d964f2d30\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2208,7 +2210,7 @@ " 1\n", " 0\n", " 11\n", - " 523f8ace-a61e-4f27-9970-84581ba6626a\n", + " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2244,10 +2246,10 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 5f1e1408-481a-4463-b7d1-7048da581607 6571294142213095721 pdf \n", - "1 dc0531e1-bc12-4919-8e27-13763592280e 10026122586747302274 pdf \n", - "2 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", - "3 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", + "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf 
\n", + "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", + "2 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", + "3 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", @@ -2374,7 +2376,7 @@ " 1\n", " 0\n", " 11\n", - " 75dbe6ba-b88c-4f66-ba78-b0ad25956453\n", + " d51af835-96b4-4ba9-b77e-543d964f2d30\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2398,7 +2400,7 @@ " 1\n", " 0\n", " 11\n", - " 523f8ace-a61e-4f27-9970-84581ba6626a\n", + " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2430,8 +2432,8 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "2 75dbe6ba-b88c-4f66-ba78-b0ad25956453 7758129997476962679 pdf \n", - "3 523f8ace-a61e-4f27-9970-84581ba6626a 14711865278795535908 pdf \n", + "2 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", + "3 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... 
\n", diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb index 69e705ae6..4028e1cc5 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb @@ -172,8 +172,12 @@ "import os, sys\n", "import shutil\n", "\n", - "input_dir = \"input\"\n", - "shutil.os.makedirs(input_dir, exist_ok=True)\n", + "if RUNNING_IN_COLAB:\n", + " input_dir = \"input\"\n", + " shutil.os.makedirs(input_dir, exist_ok=True)\n", + "else:\n", + " input_dir = \"../../data-files/pdf-processing-1/\"\n", + "\n", "output_dir = \"output\"\n", "\n", "output_pdf2pq_dir = os.path.join (output_dir, '01_pdf2pq_out')\n", @@ -319,28 +323,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "Local file 'input/earth.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/earth-copy.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/earth2.pdf' (58.53 KB) already exists. Skipping download.\n", - "Local file 'input/mars.pdf' (57.87 KB) already exists. Skipping download.\n", - "Local file 'input/spam.pdf' (24.87 KB) already exists. Skipping download.\n", - "Local file 'input/lorem-ipsum.pdf' (25.72 KB) already exists. 
Skipping download.\n" + "Using input files from : ../../data-files/pdf-processing-1/\n" ] } ], "source": [ + "if RUNNING_IN_COLAB:\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/spam.pdf', 
os.path.join(input_dir, 'spam.pdf'))\n", "\n", - "download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))" + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", + "else:\n", + " print ('Using input files from : ', input_dir)" ] }, { @@ -375,7 +377,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_pdf2pq_out'\n", + "🏃🏼 STAGE-1: Processing input='../../data-files/pdf-processing-1/' --> output='output/01_pdf2pq_out'\n", "\n" ] }, @@ -383,35 +385,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:30:38 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "11:30:38 INFO - pipeline id pipeline_id\n", - "11:30:38 INFO - code location None\n", - "11:30:38 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "11:30:38 INFO - actor creation delay 0\n", - "11:30:38 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "11:30:38 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_pdf2pq_out\n", - "11:30:38 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:30:38 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "11:30:38 INFO - Running locally\n", - "2025-01-29 11:30:39,945\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:43 INFO - orchestrator started at 2025-01-29 11:30:43\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:43 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:43 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.069149781018496, 'object_store': 4.534574889577925}\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:43 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(RayTransformFileProcessor pid=3068041)\u001b[0m 11:30:47 INFO - Initializing models\n", - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 35378.38it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=3068041)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(RayTransformFileProcessor pid=3068042)\u001b[0m 11:30:47 INFO - Initializing models\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:58 INFO - Completed 1 files in 0.032 min\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:30:58 INFO - Completed 2 files in 0.033 min\n", - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 183246.29it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=3068042)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:00 INFO - Completed 3 files in 0.063 min\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:00 INFO - Completed 4 files in 0.063 min\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:00 INFO - Completed 4 files (66.667%) in 0.063 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:02 INFO - Completed processing 6 files in 0.09 min\n", - "\u001b[36m(orchestrate pid=3067141)\u001b[0m 11:31:02 INFO - done flushing in 0.001 sec\n", - "11:31:12 INFO - Completed execution in 0.56 min, execution result 0\n" + "22:28:42 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "22:28:42 INFO - pipeline id pipeline_id\n", + "22:28:42 INFO - code location None\n", + "22:28:42 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "22:28:42 INFO - actor creation delay 0\n", + "22:28:42 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "22:28:42 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", + "22:28:42 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:28:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "22:28:42 INFO - Running locally\n", + "2025-02-04 22:28:44,917\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:28:48 INFO - orchestrator started at 2025-02-04 22:28:48\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:28:48 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:28:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.443094636313617, 'object_store': 4.721547316759825}\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:28:48 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=2137264)\u001b[0m 22:28:52 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 32402.35it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=2137265)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:02 INFO - Completed 1 files in 0.036 min\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:02 INFO - Completed 2 files in 0.036 min\n", + "\u001b[36m(RayTransformFileProcessor pid=2137265)\u001b[0m 22:28:52 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 44306.03it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=2137264)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:04 INFO - Completed 3 files in 0.07 min\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:04 INFO - Completed 4 files in 0.071 min\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:04 INFO - Completed 4 files (66.667%) in 0.071 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:06 INFO - Completed processing 6 files in 0.098 min\n", + "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:06 INFO - done flushing in 0.001 sec\n", + "22:29:16 INFO - Completed execution in 0.568 min, execution result 0\n" ] }, { @@ -515,13 +517,13 @@ " 1\n", " 0\n", " 2\n", - " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-29T11:31:00.543154\n", - " 1.836573\n", + " 2025-02-04T22:29:04.872187\n", + " 1.999260\n", " lorem-ipsum.pdf\n", " \n", " \n", @@ -531,13 +533,13 @@ " 1\n", " 0\n", " 2\n", - " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-29T11:31:02.125197\n", - " 1.579146\n", + " 2025-02-04T22:29:06.547382\n", + " 1.595728\n", " spam.pdf\n", " \n", " \n", @@ -547,13 +549,13 @@ " 1\n", " 0\n", " 11\n", - " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-29T11:31:00.528443\n", - " 1.846497\n", + " 2025-02-04T22:29:04.948884\n", + " 2.111419\n", " earth2.pdf\n", " \n", " \n", @@ -563,13 +565,13 @@ " 1\n", " 0\n", " 11\n", - " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-29T11:31:02.115064\n", - " 1.583783\n", + " 2025-02-04T22:29:06.454066\n", + " 1.578924\n", " mars.pdf\n", " \n", " \n", @@ -579,13 +581,13 @@ " 1\n", " 0\n", " 11\n", - " da835156-76ca-435f-bc0f-4fb1fca46097\n", + " a7ab12bd-7762-447f-b521-3e8d0d223111\n", " 14711865278795535908\n", " pdf\n", " 
6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:30:58.702357\n", - " 1.957911\n", + " 2025-02-04T22:29:02.832786\n", + " 2.139017\n", " earth-copy.pdf\n", " \n", " \n", @@ -595,13 +597,13 @@ " 1\n", " 0\n", " 11\n", - " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:30:58.677819\n", - " 1.933069\n", + " 2025-02-04T22:29:02.868823\n", + " 2.174988\n", " earth.pdf\n", " \n", " \n", @@ -626,12 +628,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", - "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", - "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", - "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", - "4 da835156-76ca-435f-bc0f-4fb1fca46097 14711865278795535908 pdf \n", - "5 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", + "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", + "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", + "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", + "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", + "4 a7ab12bd-7762-447f-b521-3e8d0d223111 14711865278795535908 pdf \n", + "5 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -642,12 +644,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \n", - "0 2025-01-29T11:31:00.543154 1.836573 lorem-ipsum.pdf \n", - "1 2025-01-29T11:31:02.125197 1.579146 spam.pdf \n", - "2 2025-01-29T11:31:00.528443 1.846497 earth2.pdf \n", - "3 2025-01-29T11:31:02.115064 1.583783 mars.pdf \n", - "4 2025-01-29T11:30:58.702357 1.957911 earth-copy.pdf \n", - "5 2025-01-29T11:30:58.677819 1.933069 earth.pdf " + "0 2025-02-04T22:29:04.872187 1.999260 lorem-ipsum.pdf \n", + "1 2025-02-04T22:29:06.547382 1.595728 spam.pdf \n", + "2 2025-02-04T22:29:04.948884 2.111419 earth2.pdf \n", + "3 2025-02-04T22:29:06.454066 1.578924 mars.pdf \n", + "4 2025-02-04T22:29:02.832786 2.139017 earth-copy.pdf \n", + "5 2025-02-04T22:29:02.868823 2.174988 earth.pdf " ] }, "execution_count": 9, @@ -800,29 +802,29 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:31:13 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "11:31:13 INFO - pipeline id pipeline_id\n", - "11:31:13 INFO - code location None\n", - "11:31:13 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "11:31:13 INFO - actor creation delay 0\n", - "11:31:13 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "11:31:13 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", - "11:31:13 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:31:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:31:13 INFO - Running locally\n", - "2025-01-29 11:31:14,619\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:15 INFO - orchestrator started at 2025-01-29 11:31:15\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:15 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:15 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.08399505726993, 'object_store': 4.541997527703643}\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:15 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 4 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3068934)\u001b[0m 11:31:17 INFO - done flushing in 0.001 sec\n", - "11:31:27 INFO - Completed execution in 0.224 min, execution result 0\n" + "22:29:18 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "22:29:18 INFO - pipeline id pipeline_id\n", + "22:29:18 INFO - code location None\n", + "22:29:18 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "22:29:18 INFO - actor creation delay 0\n", + "22:29:18 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "22:29:18 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "22:29:18 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:29:18 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:29:18 INFO - Running locally\n", + "2025-02-04 22:29:19,283\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:20 INFO - orchestrator started at 2025-02-04 22:29:20\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:20 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:20 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.118695831857622, 'object_store': 4.5593479154631495}\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:20 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 1 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 2 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 3 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 4 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 4 files (66.667%) in 0.004 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed processing 6 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - done flushing in 0.001 sec\n", + "22:29:31 INFO - Completed execution in 0.228 min, execution result 0\n" ] }, { @@ -830,8 +832,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 124 ms, sys: 162 ms, total: 285 ms\n", - "Wall time: 14.9 s\n" + "CPU times: user 126 ms, sys: 136 ms, total: 262 ms\n", + "Wall time: 15 s\n" ] } ], @@ -932,16 +934,16 @@ " 1\n", " 0\n", " 2\n", - " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-29T11:31:00.543154\n", - " 1.836573\n", + " 2025-02-04T22:29:04.872187\n", + " 1.999260\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 3\n", + " 2\n", " \n", " \n", " 1\n", @@ -950,16 +952,16 @@ " 1\n", " 0\n", " 2\n", - " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-29T11:31:02.125197\n", - " 1.579146\n", + " 2025-02-04T22:29:06.547382\n", + " 1.595728\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", - " 5\n", + " 4\n", " \n", " \n", " 2\n", @@ -968,13 +970,13 @@ " 1\n", " 0\n", " 11\n", - " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-29T11:31:00.528443\n", - " 1.846497\n", + " 2025-02-04T22:29:04.948884\n", + " 2.111419\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 1\n", @@ -986,16 +988,16 @@ " 1\n", " 0\n", " 11\n", - " 
22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-29T11:31:02.115064\n", - " 1.583783\n", + " 2025-02-04T22:29:06.454066\n", + " 1.578924\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", - " 4\n", + " 3\n", " \n", " \n", " 4\n", @@ -1004,13 +1006,13 @@ " 1\n", " 0\n", " 11\n", - " da835156-76ca-435f-bc0f-4fb1fca46097\n", + " a7ab12bd-7762-447f-b521-3e8d0d223111\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:30:58.702357\n", - " 1.957911\n", + " 2025-02-04T22:29:02.832786\n", + " 2.139017\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1022,16 +1024,16 @@ " 1\n", " 0\n", " 11\n", - " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:30:58.677819\n", - " 1.933069\n", + " 2025-02-04T22:29:02.868823\n", + " 2.174988\n", " earth.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", - " 2\n", + " 5\n", " \n", " \n", "\n", @@ -1055,12 +1057,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", - "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", - "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", - "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", - "4 da835156-76ca-435f-bc0f-4fb1fca46097 14711865278795535908 pdf \n", - "5 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", + "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", + "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", + "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 
10729312978404042321 pdf \n", + "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", + "4 a7ab12bd-7762-447f-b521-3e8d0d223111 14711865278795535908 pdf \n", + "5 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1071,20 +1073,20 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-29T11:31:00.543154 1.836573 lorem-ipsum.pdf \n", - "1 2025-01-29T11:31:02.125197 1.579146 spam.pdf \n", - "2 2025-01-29T11:31:00.528443 1.846497 earth2.pdf \n", - "3 2025-01-29T11:31:02.115064 1.583783 mars.pdf \n", - "4 2025-01-29T11:30:58.702357 1.957911 earth-copy.pdf \n", - "5 2025-01-29T11:30:58.677819 1.933069 earth.pdf \n", + "0 2025-02-04T22:29:04.872187 1.999260 lorem-ipsum.pdf \n", + "1 2025-02-04T22:29:06.547382 1.595728 spam.pdf \n", + "2 2025-02-04T22:29:04.948884 2.111419 earth2.pdf \n", + "3 2025-02-04T22:29:06.454066 1.578924 mars.pdf \n", + "4 2025-02-04T22:29:02.832786 2.139017 earth-copy.pdf \n", + "5 2025-02-04T22:29:02.868823 2.174988 earth.pdf \n", "\n", " doc_hash int_id_column \n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 2 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 4 \n", "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 3 \n", "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 \n", - "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 2 " + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
5 " ] }, "execution_count": 14, @@ -1138,29 +1140,29 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:31:28 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", - "11:31:28 INFO - pipeline id pipeline_id\n", - "11:31:28 INFO - code location None\n", - "11:31:28 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "11:31:28 INFO - actor creation delay 0\n", - "11:31:28 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "11:31:28 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "11:31:28 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:31:28 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:31:28 INFO - Running locally\n", - "2025-01-29 11:31:29,530\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:30 INFO - orchestrator started at 2025-01-29 11:31:30\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:30 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:30 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.072644806466997, 'object_store': 4.536322402767837}\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:30 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 4 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:31 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:32 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3070594)\u001b[0m 11:31:32 INFO - done flushing in 0.001 sec\n", - "11:31:42 INFO - Completed execution in 0.223 min, execution result 0\n" + "22:29:33 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "22:29:33 INFO - pipeline id pipeline_id\n", + "22:29:33 INFO - code location None\n", + "22:29:33 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "22:29:33 INFO - actor creation delay 0\n", + "22:29:33 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "22:29:33 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "22:29:33 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:29:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:29:33 INFO - Running locally\n", + "2025-02-04 22:29:34,292\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:35 INFO - orchestrator started at 2025-02-04 22:29:35\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:35 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:35 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.146868897601962, 'object_store': 4.5734344478696585}\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:35 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 4 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - done flushing in 0.001 sec\n", + "22:29:46 INFO - Completed execution in 0.228 min, execution result 0\n" ] }, { @@ -1168,8 +1170,8 @@ "output_type": "stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 126 ms, sys: 128 ms, total: 254 ms\n", - "Wall time: 14.6 s\n" + "CPU times: user 139 ms, sys: 163 ms, total: 301 ms\n", + "Wall time: 15 s\n" ] } ], @@ -1273,16 +1275,16 @@ " 1\n", " 0\n", " 2\n", - " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-29T11:31:00.543154\n", - " 1.836573\n", + " 2025-02-04T22:29:04.872187\n", + " 1.999260\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 3\n", + " 2\n", " []\n", " \n", " \n", @@ -1292,16 +1294,16 @@ " 1\n", " 0\n", " 2\n", - " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-29T11:31:02.125197\n", - " 1.579146\n", + " 2025-02-04T22:29:06.547382\n", + " 1.595728\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", - " 5\n", + " 4\n", " []\n", " \n", " \n", @@ -1311,13 +1313,13 @@ " 1\n", " 0\n", " 11\n", - " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-29T11:31:00.528443\n", - " 1.846497\n", + " 2025-02-04T22:29:04.948884\n", + " 2.111419\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 1\n", @@ -1330,16 +1332,16 @@ " 1\n", " 0\n", " 11\n", - " 
22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-29T11:31:02.115064\n", - " 1.583783\n", + " 2025-02-04T22:29:06.454066\n", + " 1.578924\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", - " 4\n", + " 3\n", " []\n", " \n", " \n", @@ -1349,16 +1351,16 @@ " 1\n", " 0\n", " 11\n", - " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:30:58.677819\n", - " 1.933069\n", + " 2025-02-04T22:29:02.868823\n", + " 2.174988\n", " earth.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", - " 2\n", + " 5\n", " []\n", " \n", " \n", @@ -1381,11 +1383,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", - "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", - "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", - "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", - "4 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", + "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", + "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", + "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", + "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", + "4 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1395,18 +1397,18 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-29T11:31:00.543154 1.836573 lorem-ipsum.pdf \n", - "1 2025-01-29T11:31:02.125197 1.579146 spam.pdf \n", - "2 2025-01-29T11:31:00.528443 1.846497 earth2.pdf \n", - "3 2025-01-29T11:31:02.115064 1.583783 mars.pdf \n", - "4 2025-01-29T11:30:58.677819 1.933069 earth.pdf \n", + "0 2025-02-04T22:29:04.872187 1.999260 lorem-ipsum.pdf \n", + "1 2025-02-04T22:29:06.547382 1.595728 spam.pdf \n", + "2 2025-02-04T22:29:04.948884 2.111419 earth2.pdf \n", + "3 2025-02-04T22:29:06.454066 1.578924 mars.pdf \n", + "4 2025-02-04T22:29:02.868823 2.174988 earth.pdf \n", "\n", " doc_hash int_id_column removed \n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 2 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 4 [] \n", "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 [] \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 2 [] " + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 3 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
5 [] " ] }, "execution_count": 16, @@ -1484,133 +1486,133 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:31:43 INFO - Starting SignatureCalculation step\n", - "11:31:43 INFO - Got parameters for SignatureCalculation\n", - "11:31:43 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.9, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "11:31:43 INFO - data factory scdata_ is using local configuration without input/output path\n", - "11:31:43 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "11:31:43 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:31:43 INFO - pipeline id pipeline_id\n", - "11:31:43 INFO - code location None\n", - "11:31:43 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "11:31:43 INFO - actor creation delay 0\n", - "11:31:43 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", - "11:31:43 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "11:31:43 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:31:43 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:31:43 INFO - Running locally\n", - "2025-01-29 11:31:44,268\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:45 INFO - orchestrator started at 2025-01-29 11:31:45\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:45 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05068492889404297}\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:45 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.094626618549228, 'object_store': 4.547313308343291}\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:45 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed 3 files (50.0%) in 0.003 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3072223)\u001b[0m 11:31:46 INFO - done flushing in 0.026 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=3073102)\u001b[0m 11:31:46 WARNING - table is empty, skipping processing\n", - "\u001b[36m(RayTransformFileProcessor pid=3073102)\u001b[0m 11:31:46 INFO - Starting flush()\n", - "\u001b[36m(RayTransformFileProcessor pid=3073100)\u001b[0m 11:31:46 INFO - Wrote 14 tables with a total size of 6,720 bytes\n", - "11:31:56 INFO - Completed execution in 0.225 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=3073101)\u001b[0m 11:31:46 INFO - Starting flush()\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. 
Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=3073101)\u001b[0m 11:31:46 INFO - Wrote 14 tables with a total size of 26,880 bytes\n", - "11:31:58 INFO - SignatureCalculation completed successfully\n", - "11:31:58 INFO - Starting ClusterAnalysis step\n", - "11:31:58 INFO - Got parameters for ClusterAnalysis\n", - "11:31:58 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.9, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "11:31:58 INFO - pipeline id pipeline_id\n", - "11:31:58 INFO - code location None\n", - "11:31:58 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "11:31:58 INFO - actor creation delay 0\n", - "11:31:58 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", - "11:31:58 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "11:31:58 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:31:58 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:31:58 INFO - Running locally\n", - "2025-01-29 11:31:59,071\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:00 INFO - orchestrator started at 2025-01-29 11:32:00\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:00 INFO - Number of folders is 14\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:00 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.028518676757812, 'object_store': 4.514259338378906}\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:00 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 2 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 3 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 4 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 5 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 6 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 7 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 8 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 9 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 10 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 11 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed 11 files (78.571%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - Completed processing 14 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3073829)\u001b[0m 11:32:01 INFO - done flushing in 0.001 sec\n", - "11:32:11 INFO - Completed execution in 0.222 min, execution result 0\n", - "11:32:12 INFO - ClusterAnalysis completed successfully\n", - "11:32:12 INFO - Starting GetDuplicateList step\n", - "11:32:12 INFO - Got parameters for GetDuplicateList\n", - "11:32:12 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "11:32:12 INFO - pipeline id pipeline_id\n", - "11:32:12 INFO - code location None\n", - "11:32:12 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "11:32:12 INFO - actor creation delay 0\n", - "11:32:12 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", - "11:32:12 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "11:32:12 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:32:12 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:32:12 INFO - Running locally\n", - "2025-01-29 11:32:13,701\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:14 INFO - orchestrator started at 2025-01-29 11:32:14\n", - "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:14 INFO - Number of folders is 1\n", - "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:14 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.045405579730868, 'object_store': 4.522702788934112}\n", - "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:14 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:16 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:16 INFO - Completed processing 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=3075422)\u001b[0m 11:32:16 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=3076312)\u001b[0m 11:32:16 INFO - Get Duplicate List for folder docs_to_remove\n", - "\u001b[36m(RayTransformFileProcessor pid=3076312)\u001b[0m 11:32:16 INFO - 0 documents marked as duplicates\n", - "11:32:26 INFO - Completed execution in 0.222 min, execution result 0\n", - "11:32:27 INFO - GetDuplicateList completed successfully\n", - "11:32:27 INFO - Starting DataCleaning step\n", - "11:32:27 INFO - Got parameters for DataCleaning\n", - "11:32:27 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "11:32:27 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "11:32:27 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "11:32:27 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - 
"11:32:27 INFO - pipeline id pipeline_id\n", - "11:32:27 INFO - code location None\n", - "11:32:27 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "11:32:27 INFO - actor creation delay 0\n", - "11:32:27 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", - "11:32:27 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "11:32:27 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:32:27 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:32:27 INFO - Running locally\n", - "2025-01-29 11:32:28,365\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:29 INFO - orchestrator started at 2025-01-29 11:32:29\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:29 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05068492889404297}\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:29 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.023682404309511, 'object_store': 4.511841201223433}\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:29 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed 3 
files (50.0%) in 0.003 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3076959)\u001b[0m 11:32:30 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=3077841)\u001b[0m 11:32:30 WARNING - table is empty, skipping processing\n", - "11:32:40 INFO - Completed execution in 0.227 min, execution result 0\n", - "11:32:42 INFO - DataCleaning completed successfully\n" + "22:29:48 INFO - Starting SignatureCalculation step\n", + "22:29:48 INFO - Got parameters for SignatureCalculation\n", + "22:29:48 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.9, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "22:29:48 INFO - data factory scdata_ is using local configuration without input/output path\n", + "22:29:48 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "22:29:48 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:29:48 INFO - pipeline id pipeline_id\n", + "22:29:48 INFO - code location None\n", + "22:29:48 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "22:29:48 INFO - actor creation delay 0\n", + "22:29:48 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", + "22:29:48 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "22:29:48 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:29:48 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to 
use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:29:48 INFO - Running locally\n", + "2025-02-04 22:29:49,424\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:50 INFO - orchestrator started at 2025-02-04 22:29:50\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:50 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05068492889404297}\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.059674073010683, 'object_store': 4.529837035574019}\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:50 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed 1 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed 2 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed 3 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed 3 files (50.0%) in 0.004 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed processing 6 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - done flushing in 0.029 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=2142575)\u001b[0m 22:29:52 INFO - Starting flush()\n", + "\u001b[36m(RayTransformFileProcessor pid=2142575)\u001b[0m 22:29:52 INFO - Wrote 14 tables with a total size of 6,720 bytes\n", + "\u001b[36m(RayTransformFileProcessor pid=2142576)\u001b[0m 22:29:52 WARNING - table is empty, skipping processing\n", + "22:30:02 INFO - Completed execution in 0.23 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=2142576)\u001b[0m 22:29:52 INFO - Starting flush()\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", + "\u001b[36m(RayTransformFileProcessor pid=2142576)\u001b[0m 22:29:52 INFO - Wrote 14 tables with a total size of 6,720 bytes\u001b[32m [repeated 2x across cluster]\u001b[0m\n", + "22:30:03 INFO - SignatureCalculation completed successfully\n", + "22:30:03 INFO - Starting ClusterAnalysis step\n", + "22:30:03 INFO - Got parameters for ClusterAnalysis\n", + "22:30:03 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.9, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "22:30:03 INFO - pipeline id pipeline_id\n", + "22:30:03 INFO - code location None\n", + "22:30:03 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "22:30:03 INFO - actor creation delay 0\n", + "22:30:03 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", + "22:30:03 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands 
output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "22:30:03 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:30:03 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:30:03 INFO - Running locally\n", + "2025-02-04 22:30:04,465\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:05 INFO - orchestrator started at 2025-02-04 22:30:05\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:05 INFO - Number of folders is 14\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:05 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.936753083020449, 'object_store': 4.468376540578902}\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:05 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 2 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 3 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 4 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 5 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 6 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 7 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 8 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 9 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 10 files in 0.0 
min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 11 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 11 files (78.571%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed processing 14 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - done flushing in 0.001 sec\n", + "22:30:16 INFO - Completed execution in 0.222 min, execution result 0\n", + "22:30:18 INFO - ClusterAnalysis completed successfully\n", + "22:30:18 INFO - Starting GetDuplicateList step\n", + "22:30:18 INFO - Got parameters for GetDuplicateList\n", + "22:30:18 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "22:30:18 INFO - pipeline id pipeline_id\n", + "22:30:18 INFO - code location None\n", + "22:30:18 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "22:30:18 INFO - actor creation delay 0\n", + "22:30:18 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", + "22:30:18 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "22:30:18 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:30:18 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:30:18 INFO - Running locally\n", + "2025-02-04 22:30:18,965\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:20 INFO - orchestrator started at 2025-02-04 22:30:20\n", + "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:20 INFO - Number of folders is 1\n", + "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:20 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.91086196899414, 'object_store': 4.45543098449707}\n", + "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:20 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:21 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(RayTransformFileProcessor pid=2145840)\u001b[0m 22:30:21 INFO - Get Duplicate List for folder docs_to_remove\n", + "\u001b[36m(RayTransformFileProcessor pid=2145840)\u001b[0m 22:30:21 INFO - 0 documents marked as duplicates\n", + "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:21 INFO - Completed processing 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:21 INFO - done flushing in 0.001 sec\n", + "22:30:31 INFO - Completed execution in 0.222 min, execution result 0\n", + "22:30:32 INFO - GetDuplicateList completed successfully\n", + "22:30:32 INFO - Starting DataCleaning step\n", + "22:30:32 INFO - Got parameters for DataCleaning\n", + "22:30:32 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "22:30:32 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "22:30:32 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "22:30:32 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:30:32 
INFO - pipeline id pipeline_id\n", + "22:30:32 INFO - code location None\n", + "22:30:32 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "22:30:32 INFO - actor creation delay 0\n", + "22:30:32 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", + "22:30:32 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "22:30:32 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:30:32 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:30:32 INFO - Running locally\n", + "2025-02-04 22:30:33,492\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:34 INFO - orchestrator started at 2025-02-04 22:30:34\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:34 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05068492889404297}\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:34 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.780509186908603, 'object_store': 4.390254592522979}\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:34 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed 3 files 
(50.0%) in 0.003 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=2147461)\u001b[0m 22:30:36 WARNING - table is empty, skipping processing\n", + "22:30:46 INFO - Completed execution in 0.227 min, execution result 0\n", + "22:30:47 INFO - DataCleaning completed successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 517 ms, sys: 575 ms, total: 1.09 s\n", - "Wall time: 59.1 s\n" + "CPU times: user 474 ms, sys: 520 ms, total: 994 ms\n", + "Wall time: 59.4 s\n" ] } ], @@ -1719,16 +1721,16 @@ " 1\n", " 0\n", " 2\n", - " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-01-29T11:31:00.543154\n", - " 1.836573\n", + " 2025-02-04T22:29:04.872187\n", + " 1.999260\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 3\n", + " 2\n", " []\n", " \n", " \n", @@ -1738,16 +1740,16 @@ " 1\n", " 0\n", " 2\n", - " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-01-29T11:31:02.125197\n", - " 1.579146\n", + " 2025-02-04T22:29:06.547382\n", + " 1.595728\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", - " 5\n", + " 4\n", " []\n", " \n", " \n", @@ -1757,13 +1759,13 @@ " 1\n", " 0\n", " 11\n", - " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-01-29T11:31:00.528443\n", - " 1.846497\n", + " 2025-02-04T22:29:04.948884\n", + " 
2.111419\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 1\n", @@ -1776,16 +1778,16 @@ " 1\n", " 0\n", " 11\n", - " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-01-29T11:31:02.115064\n", - " 1.583783\n", + " 2025-02-04T22:29:06.454066\n", + " 1.578924\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", - " 4\n", + " 3\n", " []\n", " \n", " \n", @@ -1795,16 +1797,16 @@ " 1\n", " 0\n", " 11\n", - " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-01-29T11:30:58.677819\n", - " 1.933069\n", + " 2025-02-04T22:29:02.868823\n", + " 2.174988\n", " earth.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", - " 2\n", + " 5\n", " []\n", " \n", " \n", @@ -1827,11 +1829,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", - "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", - "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", - "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", - "4 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", + "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", + "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", + "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", + "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", + "4 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1841,18 +1843,18 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-01-29T11:31:00.543154 1.836573 lorem-ipsum.pdf \n", - "1 2025-01-29T11:31:02.125197 1.579146 spam.pdf \n", - "2 2025-01-29T11:31:00.528443 1.846497 earth2.pdf \n", - "3 2025-01-29T11:31:02.115064 1.583783 mars.pdf \n", - "4 2025-01-29T11:30:58.677819 1.933069 earth.pdf \n", + "0 2025-02-04T22:29:04.872187 1.999260 lorem-ipsum.pdf \n", + "1 2025-02-04T22:29:06.547382 1.595728 spam.pdf \n", + "2 2025-02-04T22:29:04.948884 2.111419 earth2.pdf \n", + "3 2025-02-04T22:29:06.454066 1.578924 mars.pdf \n", + "4 2025-02-04T22:29:02.868823 2.174988 earth.pdf \n", "\n", " doc_hash int_id_column removed \n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 2 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 4 [] \n", "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 [] \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 2 [] " + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 3 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
5 [] " ] }, "execution_count": 18, @@ -1916,33 +1918,33 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:32:42 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "11:32:42 INFO - data factory docq_ is using local configuration without input/output path\n", - "11:32:42 INFO - data factory docq_ max_files -1, n_sample -1\n", - "11:32:42 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:32:42 INFO - pipeline id pipeline_id\n", - "11:32:42 INFO - code location None\n", - "11:32:42 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "11:32:42 INFO - actor creation delay 0\n", - "11:32:42 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", - "11:32:42 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", - "11:32:42 INFO - data factory data_ max_files -1, n_sample -1\n", - "11:32:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:32:42 INFO - Running locally\n", - "2025-01-29 11:32:43,435\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:44 INFO - orchestrator started at 2025-01-29 11:32:44\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:44 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0069904327392578125, 'total_file_size': 0.047461509704589844}\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:44 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.050894166342914, 'object_store': 4.525447081774473}\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:44 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(RayTransformFileProcessor pid=3079444)\u001b[0m 11:32:45 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed 3 files (60.0%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - Completed processing 5 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=3078577)\u001b[0m 11:32:46 INFO - done flushing in 0.001 sec\n", - "11:32:56 INFO - Completed execution in 0.226 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=3079443)\u001b[0m 11:32:45 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n" + "22:30:47 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "22:30:47 INFO - data factory docq_ is using local configuration without input/output path\n", + "22:30:47 INFO - data factory docq_ max_files -1, n_sample -1\n", + "22:30:47 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:30:47 INFO - pipeline id pipeline_id\n", + "22:30:47 INFO - code location None\n", + "22:30:47 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "22:30:47 INFO - actor creation delay 0\n", + "22:30:47 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", + "22:30:47 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "22:30:47 INFO - data factory data_ max_files -1, n_sample -1\n", + "22:30:47 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "22:30:47 INFO - 
Running locally\n", + "2025-02-04 22:30:48,687\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:49 INFO - orchestrator started at 2025-02-04 22:30:49\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:49 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0069904327392578125, 'total_file_size': 0.047461509704589844}\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:49 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.778627778403461, 'object_store': 4.389313887804747}\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:49 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=2149145)\u001b[0m 22:30:50 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed 3 files (60.0%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed processing 5 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - done flushing in 0.001 sec\n", + "22:31:01 INFO - Completed execution in 0.224 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=2149144)\u001b[0m 22:30:50 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n" ] }, { @@ -1950,8 +1952,8 @@ "output_type": "stream", "text": [ "✅ Stage:5 completed successfully\n", - "CPU times: user 121 ms, sys: 170 ms, total: 290 ms\n", - "Wall time: 15 s\n" + "CPU times: user 116 ms, sys: 153 ms, total: 269 ms\n", + "Wall time: 14.7 s\n" ] } ], @@ -2063,7 +2065,7 @@ " 1\n", " 0\n", " 2\n", - " 10a6bad2-d52d-4a9f-a735-e19d35055811\n", + " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", @@ -2087,7 +2089,7 @@ " 1\n", " 0\n", " 2\n", - " 8b989cd0-0439-4c9c-9d3e-5851b72d4eff\n", + " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", @@ -2111,7 +2113,7 @@ " 1\n", " 0\n", " 11\n", - " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", @@ -2135,7 +2137,7 @@ " 1\n", " 0\n", " 11\n", - " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2159,7 +2161,7 @@ " 1\n", " 0\n", " 11\n", - " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2197,11 +2199,11 @@ "4 1 0 11 
\n", "\n", " document_id document_hash ext \\\n", - "0 10a6bad2-d52d-4a9f-a735-e19d35055811 6571294142213095721 pdf \n", - "1 8b989cd0-0439-4c9c-9d3e-5851b72d4eff 10026122586747302274 pdf \n", - "2 035517d0-a99f-4ccb-ab3f-5dab83f64f6b 10729312978404042321 pdf \n", - "3 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 7758129997476962679 pdf \n", - "4 8567cfde-a3eb-440b-b758-5948d7706088 14711865278795535908 pdf \n", + "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", + "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", + "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", + "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", + "4 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", @@ -2324,7 +2326,7 @@ " 1\n", " 0\n", " 11\n", - " 035517d0-a99f-4ccb-ab3f-5dab83f64f6b\n", + " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", @@ -2348,7 +2350,7 @@ " 1\n", " 0\n", " 11\n", - " 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9\n", + " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2372,7 +2374,7 @@ " 1\n", " 0\n", " 11\n", - " 8567cfde-a3eb-440b-b758-5948d7706088\n", + " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2401,9 +2403,9 @@ "4 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
1 \n", "\n", " num_tables num_doc_elements document_id \\\n", - "2 0 11 035517d0-a99f-4ccb-ab3f-5dab83f64f6b \n", - "3 0 11 22d4ce57-0b54-4b4f-bd5c-765919b4d5c9 \n", - "4 0 11 8567cfde-a3eb-440b-b758-5948d7706088 \n", + "2 0 11 8c65dd19-fa9c-47ae-9099-f8daeb568755 \n", + "3 0 11 2cef2fa6-ffa7-477f-a2f5-323ee68289aa \n", + "4 0 11 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 \n", "\n", " document_hash ext \\\n", "2 10729312978404042321 pdf \n", From 6b04bdee474d086d11b1c81d1d354014b5729016 Mon Sep 17 00:00:00 2001 From: Sujee Maniyam Date: Tue, 4 Feb 2025 23:11:36 -0800 Subject: [PATCH 6/6] Fixed data files path Signed-off-by: Sujee Maniyam --- .../pdf_processing_1_python.ipynb | 618 ++++++------- .../pdf_processing_1_ray.ipynb | 834 +++++++++--------- 2 files changed, 726 insertions(+), 726 deletions(-) diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb index e4f9aa713..e6b4cb951 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_python.ipynb @@ -320,13 +320,13 @@ "source": [ "## Step-3: Inspect the Data\n", "\n", - "We will use simple PDFs. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/pdf-processing-1/input/)\n", + "We will use simple PDFs. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/data-files/pdf-processing-1/)\n", "\n", - "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth-copy.pdf)\n", - "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth2.pdf) almost similar to earth.pdf (ONE word difference!)\n", - "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/mars.pdf)\n", - "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/spam.pdf) - contains spammy contents\n", - "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" + "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf)\n", + "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf) nearly identical to earth.pdf (ONE word difference!)\n", + "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf)\n", + "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf) - contains spammy content\n", + "- 
[lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" ] }, { @@ -362,17 +362,17 @@ "source": [ "if RUNNING_IN_COLAB:\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + " download_file 
('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", "else:\n", " print ('Using input files from : ', input_dir)" ] @@ -441,21 +441,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:09:52 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "22:09:52 INFO - pipeline id pipeline_id\n", - "22:09:52 INFO - code location None\n", - "22:09:52 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", - "22:09:52 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:09:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "22:09:52 INFO - orchestrator pdf2parquet started at 2025-02-04 22:09:52\n", - "22:09:52 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "22:09:52 INFO - Initializing models\n" + "23:06:13 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "23:06:13 INFO - pipeline id 
pipeline_id\n", + "23:06:13 INFO - code location None\n", + "23:06:13 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", + "23:06:13 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "23:06:13 INFO - orchestrator pdf2parquet started at 2025-02-04 23:06:13\n", + "23:06:13 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "23:06:13 INFO - Initializing models\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "730bac430d12421f9c300c4ee68b11ca", + "model_id": "21a1c68550c848cba79340080a1ccde4", "version_major": 2, "version_minor": 0 }, @@ -470,15 +470,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:09:57 INFO - Completed 1 files (16.67%) in 0.019 min\n", - "22:09:58 INFO - Completed 2 files (33.33%) in 0.034 min\n", - "22:09:58 INFO - Completed 3 files (50.0%) in 0.044 min\n", - "22:09:59 INFO - Completed 4 files (66.67%) in 0.054 min\n", - "22:10:00 INFO - Completed 5 files (83.33%) in 0.065 min\n", - "22:10:00 INFO - Completed 6 files (100.0%) in 0.076 min\n", - "22:10:00 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:10:00 INFO - done flushing in 0.0 sec\n", - "22:10:00 INFO - Completed execution in 0.139 min, execution result 0\n" + "23:06:18 INFO - Completed 1 files (16.67%) in 0.018 min\n", + "23:06:19 INFO - Completed 2 files (33.33%) in 0.033 min\n", + "23:06:19 INFO - Completed 3 files (50.0%) in 0.044 min\n", + "23:06:20 INFO - Completed 4 files (66.67%) in 0.055 min\n", + "23:06:21 INFO - Completed 5 files (83.33%) in 0.067 min\n", + "23:06:21 INFO - Completed 6 files (100.0%) in 0.078 min\n", + 
"23:06:21 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:21 INFO - done flushing in 0.0 sec\n", + "23:06:21 INFO - Completed execution in 0.141 min, execution result 0\n" ] }, { @@ -486,8 +486,8 @@ "output_type": "stream", "text": [ "✅ Stage:1 completed successfully\n", - "CPU times: user 20.8 s, sys: 2.35 s, total: 23.1 s\n", - "Wall time: 12.4 s\n" + "CPU times: user 21.5 s, sys: 1.94 s, total: 23.5 s\n", + "Wall time: 11.3 s\n" ] } ], @@ -588,13 +588,13 @@ " 1\n", " 0\n", " 2\n", - " da0c5fcb-38a7-461e-965d-d76b8b759190\n", + " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T22:09:59.393899\n", - " 0.611141\n", + " 2025-02-04T23:06:20.470544\n", + " 0.693593\n", " lorem-ipsum.pdf\n", " \n", " \n", @@ -604,13 +604,13 @@ " 1\n", " 0\n", " 2\n", - " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", + " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T22:10:00.665091\n", - " 0.609956\n", + " 2025-02-04T23:06:21.819893\n", + " 0.676735\n", " spam.pdf\n", " \n", " \n", @@ -620,13 +620,13 @@ " 1\n", " 0\n", " 11\n", - " 281e5825-705a-4710-941c-82ce3628dfd9\n", + " 875d0907-8dd3-4ef9-b3b0-a0083e7ad438\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T22:09:58.781029\n", - " 0.619509\n", + " 2025-02-04T23:06:19.774915\n", + " 0.641045\n", " earth2.pdf\n", " \n", " \n", @@ -636,13 +636,13 @@ " 1\n", " 0\n", " 11\n", - " d51af835-96b4-4ba9-b77e-543d964f2d30\n", + " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T22:10:00.053520\n", - " 0.657007\n", + " 2025-02-04T23:06:21.141230\n", + " 0.668992\n", " mars.pdf\n", " \n", " \n", @@ -652,13 +652,13 @@ " 1\n", " 
0\n", " 11\n", - " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", + " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:09:57.243139\n", - " 1.110305\n", + " 2025-02-04T23:06:18.199803\n", + " 1.053618\n", " earth-copy.pdf\n", " \n", " \n", @@ -668,13 +668,13 @@ " 1\n", " 0\n", " 11\n", - " f75a8fad-c39a-42bd-9c47-a0975312bc5d\n", + " c6c18475-9365-4325-85dc-8acf6b969d8f\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:09:58.159736\n", - " 0.902815\n", + " 2025-02-04T23:06:19.132090\n", + " 0.929218\n", " earth.pdf\n", " \n", " \n", @@ -699,12 +699,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", - "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", - "2 281e5825-705a-4710-941c-82ce3628dfd9 10729312978404042321 pdf \n", - "3 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", - "4 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", - "5 f75a8fad-c39a-42bd-9c47-a0975312bc5d 14711865278795535908 pdf \n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", + "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "5 c6c18475-9365-4325-85dc-8acf6b969d8f 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -715,12 +715,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \n", - "0 2025-02-04T22:09:59.393899 0.611141 lorem-ipsum.pdf \n", - "1 2025-02-04T22:10:00.665091 0.609956 spam.pdf \n", - "2 2025-02-04T22:09:58.781029 0.619509 earth2.pdf \n", - "3 2025-02-04T22:10:00.053520 0.657007 mars.pdf \n", - "4 2025-02-04T22:09:57.243139 1.110305 earth-copy.pdf \n", - "5 2025-02-04T22:09:58.159736 0.902815 earth.pdf " + "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", + "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", + "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", + "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", + "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", + "5 2025-02-04T23:06:19.132090 0.929218 earth.pdf " ] }, "execution_count": 8, @@ -897,23 +897,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:10:00 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "22:10:00 INFO - pipeline id pipeline_id\n", - "22:10:00 INFO - code location None\n", - "22:10:00 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", - "22:10:00 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:10:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:00 INFO - orchestrator doc_id started at 2025-02-04 22:10:00\n", - "22:10:00 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", - "22:10:00 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "22:10:00 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "22:10:00 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "22:10:00 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "22:10:00 INFO - 
Completed 5 files (83.33%) in 0.0 min\n", - "22:10:00 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "22:10:00 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:10:00 INFO - done flushing in 0.0 sec\n", - "22:10:00 INFO - Completed execution in 0.0 min, execution result 0\n" + "23:06:22 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator doc_id started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -921,8 +921,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 28.4 ms, sys: 2.92 ms, total: 31.3 ms\n", - "Wall time: 26.8 ms\n" + "CPU times: user 27.6 ms, sys: 2.32 ms, total: 29.9 ms\n", + 
"Wall time: 23.4 ms\n" ] } ], @@ -1027,13 +1027,13 @@ " 1\n", " 0\n", " 2\n", - " da0c5fcb-38a7-461e-965d-d76b8b759190\n", + " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T22:09:59.393899\n", - " 0.611141\n", + " 2025-02-04T23:06:20.470544\n", + " 0.693593\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1045,13 +1045,13 @@ " 1\n", " 0\n", " 2\n", - " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", + " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T22:10:00.665091\n", - " 0.609956\n", + " 2025-02-04T23:06:21.819893\n", + " 0.676735\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1063,13 +1063,13 @@ " 1\n", " 0\n", " 11\n", - " 281e5825-705a-4710-941c-82ce3628dfd9\n", + " 875d0907-8dd3-4ef9-b3b0-a0083e7ad438\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T22:09:58.781029\n", - " 0.619509\n", + " 2025-02-04T23:06:19.774915\n", + " 0.641045\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1081,13 +1081,13 @@ " 1\n", " 0\n", " 11\n", - " d51af835-96b4-4ba9-b77e-543d964f2d30\n", + " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T22:10:00.053520\n", - " 0.657007\n", + " 2025-02-04T23:06:21.141230\n", + " 0.668992\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1099,13 +1099,13 @@ " 1\n", " 0\n", " 11\n", - " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", + " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 
2025-02-04T22:09:57.243139\n", - " 1.110305\n", + " 2025-02-04T23:06:18.199803\n", + " 1.053618\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1117,13 +1117,13 @@ " 1\n", " 0\n", " 11\n", - " f75a8fad-c39a-42bd-9c47-a0975312bc5d\n", + " c6c18475-9365-4325-85dc-8acf6b969d8f\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:09:58.159736\n", - " 0.902815\n", + " 2025-02-04T23:06:19.132090\n", + " 0.929218\n", " earth.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 1\n", @@ -1150,12 +1150,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", - "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", - "2 281e5825-705a-4710-941c-82ce3628dfd9 10729312978404042321 pdf \n", - "3 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", - "4 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", - "5 f75a8fad-c39a-42bd-9c47-a0975312bc5d 14711865278795535908 pdf \n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", + "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", + "5 c6c18475-9365-4325-85dc-8acf6b969d8f 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1166,12 +1166,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T22:09:59.393899 0.611141 lorem-ipsum.pdf \n", - "1 2025-02-04T22:10:00.665091 0.609956 spam.pdf \n", - "2 2025-02-04T22:09:58.781029 0.619509 earth2.pdf \n", - "3 2025-02-04T22:10:00.053520 0.657007 mars.pdf \n", - "4 2025-02-04T22:09:57.243139 1.110305 earth-copy.pdf \n", - "5 2025-02-04T22:09:58.159736 0.902815 earth.pdf \n", + "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", + "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", + "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", + "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", + "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", + "5 2025-02-04T23:06:19.132090 0.929218 earth.pdf \n", "\n", " doc_hash int_id_column \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", @@ -1243,24 +1243,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:10:00 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", - "22:10:00 INFO - pipeline id pipeline_id\n", - "22:10:00 INFO - code location None\n", - "22:10:00 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "22:10:00 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:10:00 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:00 INFO - orchestrator ededup started at 2025-02-04 22:10:00\n", - "22:10:00 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", - "22:10:00 INFO - Starting from the beginning\n", - "22:10:00 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "22:10:00 INFO - Completed 2 files (33.33%) in 0.0 min\n", - 
"22:10:00 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "22:10:00 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "22:10:00 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "22:10:00 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "22:10:00 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:10:00 INFO - done flushing in 0.0 sec\n", - "22:10:00 INFO - Completed execution in 0.0 min, execution result 0\n" + "23:06:22 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator ededup started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "23:06:22 INFO - Starting from the beginning\n", + "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 6 files (100.0%) in 0.0 min\n", + "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -1268,8 +1268,8 @@ "output_type": 
"stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 26.8 ms, sys: 11.8 ms, total: 38.6 ms\n", - "Wall time: 32.8 ms\n" + "CPU times: user 37.3 ms, sys: 3.56 ms, total: 40.9 ms\n", + "Wall time: 36.4 ms\n" ] } ], @@ -1375,13 +1375,13 @@ " 1\n", " 0\n", " 2\n", - " da0c5fcb-38a7-461e-965d-d76b8b759190\n", + " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T22:09:59.393899\n", - " 0.611141\n", + " 2025-02-04T23:06:20.470544\n", + " 0.693593\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1394,13 +1394,13 @@ " 1\n", " 0\n", " 2\n", - " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", + " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T22:10:00.665091\n", - " 0.609956\n", + " 2025-02-04T23:06:21.819893\n", + " 0.676735\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1413,13 +1413,13 @@ " 1\n", " 0\n", " 11\n", - " 281e5825-705a-4710-941c-82ce3628dfd9\n", + " 875d0907-8dd3-4ef9-b3b0-a0083e7ad438\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T22:09:58.781029\n", - " 0.619509\n", + " 2025-02-04T23:06:19.774915\n", + " 0.641045\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 2\n", @@ -1432,13 +1432,13 @@ " 1\n", " 0\n", " 11\n", - " d51af835-96b4-4ba9-b77e-543d964f2d30\n", + " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T22:10:00.053520\n", - " 0.657007\n", + " 2025-02-04T23:06:21.141230\n", + " 0.668992\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1451,13 +1451,13 @@ " 1\n", " 0\n", " 11\n", - " 
ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", + " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:09:57.243139\n", - " 1.110305\n", + " 2025-02-04T23:06:18.199803\n", + " 1.053618\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1483,11 +1483,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", - "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", - "2 281e5825-705a-4710-941c-82ce3628dfd9 10729312978404042321 pdf \n", - "3 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", - "4 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 875d0907-8dd3-4ef9-b3b0-a0083e7ad438 10729312978404042321 pdf \n", + "3 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "4 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1497,11 +1497,11 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T22:09:59.393899 0.611141 lorem-ipsum.pdf \n", - "1 2025-02-04T22:10:00.665091 0.609956 spam.pdf \n", - "2 2025-02-04T22:09:58.781029 0.619509 earth2.pdf \n", - "3 2025-02-04T22:10:00.053520 0.657007 mars.pdf \n", - "4 2025-02-04T22:09:57.243139 1.110305 earth-copy.pdf \n", + "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", + "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", + "2 2025-02-04T23:06:19.774915 0.641045 earth2.pdf \n", + "3 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", + "4 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", @@ -1596,109 +1596,109 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:10:01 INFO - Starting SignatureCalculation step\n", - "22:10:01 INFO - Got parameters for SignatureCalculation\n", - "22:10:01 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "22:10:01 INFO - data factory scdata_ is using local configuration without input/output path\n", - "22:10:01 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "22:10:01 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:01 INFO - pipeline id pipeline_id\n", - "22:10:01 INFO - code location None\n", - "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:10:01 INFO - data factory data_ Not using data sets, checkpointing 
False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:01 INFO - orchestrator minhash started at 2025-02-04 22:10:01\n", - "22:10:01 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", - "22:10:01 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "22:10:01 WARNING - table is empty, skipping processing\n", - "22:10:01 INFO - Completed 2 files (33.33%) in 0.001 min\n", - "22:10:01 INFO - Completed 3 files (50.0%) in 0.001 min\n", - "22:10:01 INFO - Completed 4 files (66.67%) in 0.001 min\n", - "22:10:01 INFO - Completed 5 files (83.33%) in 0.001 min\n", - "22:10:01 INFO - Completed 6 files (100.0%) in 0.001 min\n", - "22:10:01 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:10:01 INFO - Starting flush()\n", - "22:10:01 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", - "22:10:01 INFO - done flushing in 0.028 sec\n", - "22:10:01 INFO - Completed execution in 0.001 min, execution result 0\n", - "22:10:01 INFO - SignatureCalculation completed successfully\n", - "22:10:01 INFO - Starting ClusterAnalysis step\n", - "22:10:01 INFO - Got parameters for ClusterAnalysis\n", - "22:10:01 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "22:10:01 INFO - pipeline id pipeline_id\n", - "22:10:01 INFO - code location None\n", - "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:01 INFO - orchestrator cluster started at 
2025-02-04 22:10:01\n", - "22:10:01 INFO - Number of folders is 14\n", - "22:10:01 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "22:10:01 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "22:10:01 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "22:10:01 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "22:10:01 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "22:10:01 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "22:10:01 INFO - Completed 7 files (50.0%) in 0.0 min\n", - "22:10:01 INFO - Completed 8 files (57.14%) in 0.001 min\n", - "22:10:01 INFO - Completed 9 files (64.29%) in 0.001 min\n", - "22:10:01 INFO - Completed 10 files (71.43%) in 0.001 min\n", - "22:10:01 INFO - Completed 11 files (78.57%) in 0.001 min\n", - "22:10:01 INFO - Completed 12 files (85.71%) in 0.001 min\n", - "22:10:01 INFO - Completed 13 files (92.86%) in 0.001 min\n", - "22:10:01 INFO - Completed 14 files (100.0%) in 0.001 min\n", - "22:10:01 INFO - Done processing 14 files, waiting for flush() completion.\n", - "22:10:01 INFO - done flushing in 0.0 sec\n", - "22:10:01 INFO - Completed execution in 0.001 min, execution result 0\n", - "22:10:01 INFO - ClusterAnalysis completed successfully\n", - "22:10:01 INFO - Starting GetDuplicateList step\n", - "22:10:01 INFO - Got parameters for GetDuplicateList\n", - "22:10:01 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "22:10:01 INFO - pipeline id pipeline_id\n", - "22:10:01 INFO - code location None\n", - "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint 
['.parquet']\n", - "22:10:01 INFO - orchestrator fdlist started at 2025-02-04 22:10:01\n", - "22:10:01 INFO - Number of folders is 1\n", - "22:10:01 INFO - Get Duplicate List for folder docs_to_remove\n", - "22:10:01 INFO - 1 documents marked as duplicates\n", - "22:10:01 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "22:10:01 INFO - Done processing 1 files, waiting for flush() completion.\n", - "22:10:01 INFO - done flushing in 0.0 sec\n", - "22:10:01 INFO - Completed execution in 0.0 min, execution result 0\n", - "22:10:01 INFO - GetDuplicateList completed successfully\n", - "22:10:01 INFO - Starting DataCleaning step\n", - "22:10:01 INFO - Got parameters for DataCleaning\n", - "22:10:01 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "22:10:01 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "22:10:01 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "22:10:01 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:01 INFO - pipeline id pipeline_id\n", - "22:10:01 INFO - code location None\n", - "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:01 INFO - orchestrator fdclean started at 2025-02-04 22:10:01\n", - "22:10:01 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 
0.050751686096191406}\n", - "22:10:01 INFO - Completed 1 files (16.67%) in 0.0 min\n", - "22:10:01 WARNING - table is empty, skipping processing\n", - "22:10:01 INFO - Completed 2 files (33.33%) in 0.0 min\n", - "22:10:01 INFO - Completed 3 files (50.0%) in 0.0 min\n", - "22:10:01 INFO - Completed 4 files (66.67%) in 0.0 min\n", - "22:10:01 INFO - Completed 5 files (83.33%) in 0.0 min\n", - "22:10:01 INFO - Completed 6 files (100.0%) in 0.0 min\n", - "22:10:01 INFO - Done processing 6 files, waiting for flush() completion.\n", - "22:10:01 INFO - done flushing in 0.0 sec\n", - "22:10:01 INFO - Completed execution in 0.0 min, execution result 0\n", - "22:10:01 INFO - DataCleaning completed successfully\n" + "23:06:22 INFO - Starting SignatureCalculation step\n", + "23:06:22 INFO - Got parameters for SignatureCalculation\n", + "23:06:22 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.8, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "23:06:22 INFO - data factory scdata_ is using local configuration without input/output path\n", + "23:06:22 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - 
orchestrator minhash started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:06:22 WARNING - table is empty, skipping processing\n", + "23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (83.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 6 files (100.0%) in 0.001 min\n", + "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:22 INFO - Starting flush()\n", + "23:06:22 INFO - Wrote 14 tables with a total size of 33,600 bytes\n", + "23:06:22 INFO - done flushing in 0.028 sec\n", + "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "23:06:22 INFO - SignatureCalculation completed successfully\n", + "23:06:22 INFO - Starting ClusterAnalysis step\n", + "23:06:22 INFO - Got parameters for ClusterAnalysis\n", + "23:06:22 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.8, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator cluster started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of folders is 14\n", + "23:06:22 INFO - Completed 1 files (7.14%) in 0.0 min\n", + 
"23:06:22 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (21.43%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "23:06:22 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "23:06:22 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 8 files (57.14%) in 0.0 min\n", + "23:06:22 INFO - Completed 9 files (64.29%) in 0.001 min\n", + "23:06:22 INFO - Completed 10 files (71.43%) in 0.001 min\n", + "23:06:22 INFO - Completed 11 files (78.57%) in 0.001 min\n", + "23:06:22 INFO - Completed 12 files (85.71%) in 0.001 min\n", + "23:06:22 INFO - Completed 13 files (92.86%) in 0.001 min\n", + "23:06:22 INFO - Completed 14 files (100.0%) in 0.001 min\n", + "23:06:22 INFO - Done processing 14 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "23:06:22 INFO - ClusterAnalysis completed successfully\n", + "23:06:22 INFO - Starting GetDuplicateList step\n", + "23:06:22 INFO - Got parameters for GetDuplicateList\n", + "23:06:22 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator fdlist started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of folders is 1\n", + 
"23:06:22 INFO - Get Duplicate List for folder docs_to_remove\n", + "23:06:22 INFO - 1 documents marked as duplicates\n", + "23:06:22 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "23:06:22 INFO - Done processing 1 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n", + "23:06:22 INFO - GetDuplicateList completed successfully\n", + "23:06:22 INFO - Starting DataCleaning step\n", + "23:06:22 INFO - Got parameters for DataCleaning\n", + "23:06:22 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "23:06:22 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "23:06:22 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator fdclean started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "23:06:22 INFO - Completed 1 files (16.67%) in 0.0 min\n", + "23:06:22 WARNING - table is empty, skipping processing\n", + 
"23:06:22 INFO - Completed 2 files (33.33%) in 0.0 min\n", + "23:06:22 INFO - Completed 3 files (50.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (66.67%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (83.33%) in 0.001 min\n", + "23:06:22 INFO - Completed 6 files (100.0%) in 0.001 min\n", + "23:06:22 INFO - Done processing 6 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.001 min, execution result 0\n", + "23:06:22 INFO - DataCleaning completed successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 232 ms, sys: 98.5 ms, total: 331 ms\n", - "Wall time: 277 ms\n" + "CPU times: user 288 ms, sys: 114 ms, total: 402 ms\n", + "Wall time: 262 ms\n" ] } ], @@ -1812,13 +1812,13 @@ " 1\n", " 0\n", " 2\n", - " da0c5fcb-38a7-461e-965d-d76b8b759190\n", + " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T22:09:59.393899\n", - " 0.611141\n", + " 2025-02-04T23:06:20.470544\n", + " 0.693593\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 3\n", @@ -1831,13 +1831,13 @@ " 1\n", " 0\n", " 2\n", - " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", + " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T22:10:00.665091\n", - " 0.609956\n", + " 2025-02-04T23:06:21.819893\n", + " 0.676735\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 5\n", @@ -1850,13 +1850,13 @@ " 1\n", " 0\n", " 11\n", - " d51af835-96b4-4ba9-b77e-543d964f2d30\n", + " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T22:10:00.053520\n", - " 0.657007\n", + " 2025-02-04T23:06:21.141230\n", + " 0.668992\n", " 
mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 4\n", @@ -1869,13 +1869,13 @@ " 1\n", " 0\n", " 11\n", - " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", + " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:09:57.243139\n", - " 1.110305\n", + " 2025-02-04T23:06:18.199803\n", + " 1.053618\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 0\n", @@ -1899,10 +1899,10 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", - "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", - "2 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", - "3 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf \n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1911,10 +1911,10 @@ "3 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T22:09:59.393899 0.611141 lorem-ipsum.pdf \n", - "1 2025-02-04T22:10:00.665091 0.609956 spam.pdf \n", - "2 2025-02-04T22:10:00.053520 0.657007 mars.pdf \n", - "3 2025-02-04T22:09:57.243139 1.110305 earth-copy.pdf \n", + "0 2025-02-04T23:06:20.470544 0.693593 lorem-ipsum.pdf \n", + "1 2025-02-04T23:06:21.819893 0.676735 spam.pdf \n", + "2 2025-02-04T23:06:21.141230 0.668992 mars.pdf \n", + "3 2025-02-04T23:06:18.199803 1.053618 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 
3 [] \n", @@ -1994,27 +1994,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:10:01 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "22:10:01 INFO - data factory docq_ is using local configuration without input/output path\n", - "22:10:01 INFO - data factory docq_ max_files -1, n_sample -1\n", - "22:10:01 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:01 INFO - pipeline id pipeline_id\n", - "22:10:01 INFO - code location None\n", - "22:10:01 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", - "22:10:01 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:10:01 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:10:01 INFO - orchestrator docq started at 2025-02-04 22:10:01\n", - "22:10:01 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", - "22:10:01 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "22:10:01 INFO - Completed 1 files (20.0%) in 0.0 min\n", - "22:10:01 WARNING - table is empty, skipping processing\n", - "22:10:01 INFO - Completed 2 files (40.0%) in 0.0 min\n", - "22:10:01 INFO - Completed 3 files (60.0%) in 0.0 min\n", - "22:10:01 INFO - Completed 4 files (80.0%) in 0.001 min\n", - "22:10:01 INFO - Completed 5 files 
(100.0%) in 0.001 min\n", - "22:10:01 INFO - Done processing 5 files, waiting for flush() completion.\n", - "22:10:01 INFO - done flushing in 0.0 sec\n", - "22:10:01 INFO - Completed execution in 0.001 min, execution result 0\n" + "23:06:22 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "23:06:22 INFO - data factory docq_ is using local configuration without input/output path\n", + "23:06:22 INFO - data factory docq_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - pipeline id pipeline_id\n", + "23:06:22 INFO - code location None\n", + "23:06:22 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "23:06:22 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:06:22 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:06:22 INFO - orchestrator docq started at 2025-02-04 23:06:22\n", + "23:06:22 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0035142898559570312, 'total_file_size': 0.040172576904296875}\n", + "23:06:22 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "23:06:22 INFO - Completed 1 files (20.0%) in 0.0 min\n", + "23:06:22 WARNING - table is empty, skipping processing\n", + "23:06:22 INFO - Completed 2 files (40.0%) in 0.0 min\n", + "23:06:22 INFO 
- Completed 3 files (60.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 4 files (80.0%) in 0.0 min\n", + "23:06:22 INFO - Completed 5 files (100.0%) in 0.0 min\n", + "23:06:22 INFO - Done processing 5 files, waiting for flush() completion.\n", + "23:06:22 INFO - done flushing in 0.0 sec\n", + "23:06:22 INFO - Completed execution in 0.0 min, execution result 0\n" ] }, { @@ -2022,8 +2022,8 @@ "output_type": "stream", "text": [ "✅ Stage:5 completed successfully\n", - "CPU times: user 54.2 ms, sys: 4.18 ms, total: 58.3 ms\n", - "Wall time: 53 ms\n" + "CPU times: user 41.8 ms, sys: 1.98 ms, total: 43.7 ms\n", + "Wall time: 36.3 ms\n" ] } ], @@ -2138,7 +2138,7 @@ " 1\n", " 0\n", " 2\n", - " da0c5fcb-38a7-461e-965d-d76b8b759190\n", + " 52b1cdf4-b1ef-4375-8e6b-23f174592c06\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", @@ -2162,7 +2162,7 @@ " 1\n", " 0\n", " 2\n", - " 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497\n", + " 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", @@ -2186,7 +2186,7 @@ " 1\n", " 0\n", " 11\n", - " d51af835-96b4-4ba9-b77e-543d964f2d30\n", + " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2210,7 +2210,7 @@ " 1\n", " 0\n", " 11\n", - " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", + " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2246,10 +2246,10 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 da0c5fcb-38a7-461e-965d-d76b8b759190 6571294142213095721 pdf \n", - "1 0accb403-0ab9-4eea-b5c5-f1dcc8d8d497 10026122586747302274 pdf \n", - "2 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", - "3 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", + "0 52b1cdf4-b1ef-4375-8e6b-23f174592c06 6571294142213095721 pdf 
\n", + "1 854dca5d-9db5-4ea5-b2e5-bddd176bf1b8 10026122586747302274 pdf \n", + "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... \n", @@ -2376,7 +2376,7 @@ " 1\n", " 0\n", " 11\n", - " d51af835-96b4-4ba9-b77e-543d964f2d30\n", + " 6264e62a-0121-4cd4-8202-ea6e228e15f1\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2400,7 +2400,7 @@ " 1\n", " 0\n", " 11\n", - " ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59\n", + " 582bc53b-96e2-4b09-8dd7-6a27a685a53e\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2432,8 +2432,8 @@ "3 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "2 d51af835-96b4-4ba9-b77e-543d964f2d30 7758129997476962679 pdf \n", - "3 ffbc6bbb-b99d-40c2-964a-6d3e5d92ef59 14711865278795535908 pdf \n", + "2 6264e62a-0121-4cd4-8202-ea6e228e15f1 7758129997476962679 pdf \n", + "3 582bc53b-96e2-4b09-8dd7-6a27a685a53e 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "2 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 717 ... \n", diff --git a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb index 4028e1cc5..04ed0fad4 100644 --- a/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb +++ b/examples/notebooks/pdf-processing-1/pdf_processing_1_ray.ipynb @@ -296,13 +296,13 @@ "source": [ "## Step-3: Inspect the Data\n", "\n", - "We will use simple PDFs. The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/notebooks/pdf-processing-1/input/)\n", + "We will use simple PDFs. 
The files are [here](https://github.com/IBM/data-prep-kit/tree/dev/examples/data-files/pdf-processing-1/)\n", "\n", - "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth-copy.pdf)\n", - "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/earth2.pdf) almost similar to earth.pdf (ONE word difference!)\n", - "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/mars.pdf)\n", - "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/spam.pdf) - contains spammy contents\n", - "- [lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/notebooks/pdf-processing-1/input/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" + "- [earth.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf) and exact duplicate [earth-copy.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf)\n", + "- [earth2.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf) nearly identical to earth.pdf (ONE word difference!)\n", + "- [mars.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf)\n", + "- [spam.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf) - contains spammy contents\n", + "- 
[lorem-ipsum.pdf](https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf) - contains 'lorem ipsum' placeholder\n" ] }, { @@ -330,17 +330,17 @@ "source": [ "if RUNNING_IN_COLAB:\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth.pdf', os.path.join(input_dir, 'earth.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth-copy.pdf', os.path.join(input_dir, 'earth-copy.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/earth2.pdf', os.path.join(input_dir, 'earth2.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/mars.pdf', os.path.join(input_dir, 'mars.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", + " download_file 
('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/spam.pdf', os.path.join(input_dir, 'spam.pdf'))\n", "\n", - " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/input/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", + " download_file ('https://raw.githubusercontent.com/sujee/data-prep-kit/process-pdf-1/examples/data-files/pdf-processing-1/lorem-ipsum.pdf', os.path.join(input_dir, 'lorem-ipsum.pdf'))\n", "else:\n", " print ('Using input files from : ', input_dir)" ] @@ -385,35 +385,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:28:42 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "22:28:42 INFO - pipeline id pipeline_id\n", - "22:28:42 INFO - code location None\n", - "22:28:42 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "22:28:42 INFO - actor creation delay 0\n", - "22:28:42 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "22:28:42 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", - "22:28:42 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:28:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "22:28:42 INFO - Running locally\n", - "2025-02-04 22:28:44,917\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:28:48 INFO - orchestrator started at 2025-02-04 22:28:48\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:28:48 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:28:48 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.443094636313617, 'object_store': 4.721547316759825}\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:28:48 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(RayTransformFileProcessor pid=2137264)\u001b[0m 22:28:52 INFO - Initializing models\n", - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 32402.35it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=2137265)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:02 INFO - Completed 1 files in 0.036 min\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:02 INFO - Completed 2 files in 0.036 min\n", - "\u001b[36m(RayTransformFileProcessor pid=2137265)\u001b[0m 22:28:52 INFO - Initializing models\n", - "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 44306.03it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=2137264)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:04 INFO - Completed 3 files in 0.07 min\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:04 INFO - Completed 4 files in 0.071 min\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:04 INFO - Completed 4 files (66.667%) in 0.071 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:06 INFO - Completed processing 6 files in 0.098 min\n", - "\u001b[36m(orchestrate pid=2136332)\u001b[0m 22:29:06 INFO - done flushing in 0.001 sec\n", - "22:29:16 INFO - Completed execution in 0.568 min, execution result 0\n" + "23:08:37 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "23:08:37 INFO - pipeline id pipeline_id\n", + "23:08:37 INFO - code location None\n", + "23:08:37 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "23:08:37 INFO - actor creation delay 0\n", + "23:08:37 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:08:37 INFO - data factory data_ is using local data access: input_folder - ../../data-files/pdf-processing-1/ output_folder - output/01_pdf2pq_out\n", + "23:08:37 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:08:37 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "23:08:37 INFO - Running locally\n", + "2025-02-04 23:08:38,509\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - orchestrator started at 2025-02-04 23:08:42\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Number of files is 6, source profile {'max_file_size': 0.055823326110839844, 'min_file_size': 0.023715972900390625, 'total_file_size': 0.2709054946899414}\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.743361664935946, 'object_store': 4.371680831536651}\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:42 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=2171540)\u001b[0m 23:08:45 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 1688.38it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=2171540)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:54 INFO - Completed 1 files in 0.031 min\n", + "\u001b[36m(RayTransformFileProcessor pid=2171541)\u001b[0m 23:08:45 INFO - Initializing models\n", + "Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 29723.41it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=2171541)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:54 INFO - Completed 2 files in 0.033 min\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 3 files in 0.062 min\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 4 files in 0.064 min\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:56 INFO - Completed 4 files (66.667%) in 0.064 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:58 INFO - Completed processing 6 files in 0.09 min\n", + "\u001b[36m(orchestrate pid=2170644)\u001b[0m 23:08:58 INFO - done flushing in 0.001 sec\n", + "23:09:08 INFO - Completed execution in 0.518 min, execution result 0\n" ] }, { @@ -517,13 +517,13 @@ " 1\n", " 0\n", " 2\n", - " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", + " 3618834f-9dfc-49a1-9066-e2724df95fec\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T22:29:04.872187\n", - " 1.999260\n", + " 2025-02-04T23:08:56.820444\n", + " 1.846058\n", " lorem-ipsum.pdf\n", " \n", " \n", @@ -533,13 +533,13 @@ " 1\n", " 0\n", " 2\n", - " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", + " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T22:29:06.547382\n", - " 1.595728\n", + " 2025-02-04T23:08:58.414120\n", + " 1.590731\n", " spam.pdf\n", " \n", " \n", @@ -549,13 +549,13 @@ " 1\n", " 0\n", " 11\n", - " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", + " 84f59118-2a64-4d4b-991c-10ca09576a74\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T22:29:04.948884\n", - " 2.111419\n", + " 2025-02-04T23:08:56.713495\n", + " 1.827202\n", " earth2.pdf\n", " \n", " \n", @@ -565,13 +565,13 @@ " 1\n", " 0\n", " 11\n", - " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", + " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T22:29:06.454066\n", - " 1.578924\n", + " 2025-02-04T23:08:58.272496\n", + " 1.547326\n", " mars.pdf\n", " \n", " \n", @@ -581,13 +581,13 @@ " 1\n", " 0\n", " 11\n", - " a7ab12bd-7762-447f-b521-3e8d0d223111\n", + " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", " 14711865278795535908\n", " pdf\n", " 
6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:29:02.832786\n", - " 2.139017\n", + " 2025-02-04T23:08:54.872145\n", + " 1.864833\n", " earth-copy.pdf\n", " \n", " \n", @@ -597,13 +597,13 @@ " 1\n", " 0\n", " 11\n", - " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", + " 5b12b0e8-946f-4538-8812-9ee74204c2d7\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:29:02.868823\n", - " 2.174988\n", + " 2025-02-04T23:08:54.969828\n", + " 1.962273\n", " earth.pdf\n", " \n", " \n", @@ -628,12 +628,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", - "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", - "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", - "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", - "4 a7ab12bd-7762-447f-b521-3e8d0d223111 14711865278795535908 pdf \n", - "5 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "5 5b12b0e8-946f-4538-8812-9ee74204c2d7 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -644,12 +644,12 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
610 \n", "\n", " date_acquired pdf_convert_time source_filename \n", - "0 2025-02-04T22:29:04.872187 1.999260 lorem-ipsum.pdf \n", - "1 2025-02-04T22:29:06.547382 1.595728 spam.pdf \n", - "2 2025-02-04T22:29:04.948884 2.111419 earth2.pdf \n", - "3 2025-02-04T22:29:06.454066 1.578924 mars.pdf \n", - "4 2025-02-04T22:29:02.832786 2.139017 earth-copy.pdf \n", - "5 2025-02-04T22:29:02.868823 2.174988 earth.pdf " + "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", + "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", + "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", + "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", + "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", + "5 2025-02-04T23:08:54.969828 1.962273 earth.pdf " ] }, "execution_count": 9, @@ -802,29 +802,29 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:29:18 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", - "22:29:18 INFO - pipeline id pipeline_id\n", - "22:29:18 INFO - code location None\n", - "22:29:18 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "22:29:18 INFO - actor creation delay 0\n", - "22:29:18 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "22:29:18 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", - "22:29:18 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:29:18 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:29:18 INFO - Running locally\n", - "2025-02-04 22:29:19,283\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:20 INFO - orchestrator started at 2025-02-04 22:29:20\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:20 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:20 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.118695831857622, 'object_store': 4.5593479154631495}\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:20 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 1 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 2 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 3 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 4 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed 4 files (66.667%) in 0.004 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - Completed processing 6 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2138214)\u001b[0m 22:29:21 INFO - done flushing in 0.001 sec\n", - "22:29:31 INFO - Completed execution in 0.228 min, execution result 0\n" + "23:09:09 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'doc_hash', 'int_column': 'int_id_column', 'start_id': 0}\n", + "23:09:09 INFO - pipeline id pipeline_id\n", + "23:09:09 INFO - code location None\n", + "23:09:09 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "23:09:09 INFO - actor creation delay 0\n", + "23:09:09 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:09:09 INFO - data factory data_ is using local data access: input_folder - output/01_pdf2pq_out output_folder - output/02_docid_out\n", + "23:09:09 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:09:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:09 INFO - Running locally\n", + "2025-02-04 23:09:10,988\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - orchestrator started at 2025-02-04 23:09:12\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Number of files is 6, source profile {'max_file_size': 0.010061264038085938, 'min_file_size': 0.0055408477783203125, 'total_file_size': 0.04969310760498047}\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.800101472064853, 'object_store': 4.400050735101104}\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:12 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 4 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - Completed processing 6 files in 0.004 min\n", + "\u001b[36m(orchestrate pid=2172314)\u001b[0m 23:09:13 INFO - done flushing in 0.001 sec\n", + "23:09:23 INFO - Completed execution in 0.226 min, execution result 0\n" ] }, { @@ -832,8 +832,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 126 ms, sys: 136 ms, total: 262 ms\n", - "Wall time: 15 s\n" + "CPU times: user 122 ms, sys: 132 ms, total: 254 ms\n", + "Wall time: 14.8 s\n" ] } ], @@ -934,16 +934,16 @@ " 1\n", " 0\n", " 2\n", - " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", + " 3618834f-9dfc-49a1-9066-e2724df95fec\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T22:29:04.872187\n", - " 1.999260\n", + " 2025-02-04T23:08:56.820444\n", + " 1.846058\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 2\n", + " 3\n", " \n", " \n", " 1\n", @@ -952,16 +952,16 @@ " 1\n", " 0\n", " 2\n", - " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", + " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T22:29:06.547382\n", - " 1.595728\n", + " 2025-02-04T23:08:58.414120\n", + " 1.590731\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", - " 4\n", + " 5\n", " \n", " \n", " 2\n", @@ -970,16 +970,16 @@ " 1\n", " 0\n", " 11\n", - " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", + " 84f59118-2a64-4d4b-991c-10ca09576a74\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T22:29:04.948884\n", - " 2.111419\n", + " 2025-02-04T23:08:56.713495\n", + " 1.827202\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", - " 1\n", + " 2\n", " \n", " \n", " 3\n", @@ -988,16 +988,16 @@ " 1\n", " 0\n", " 11\n", - 
" 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", + " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T22:29:06.454066\n", - " 1.578924\n", + " 2025-02-04T23:08:58.272496\n", + " 1.547326\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", - " 3\n", + " 4\n", " \n", " \n", " 4\n", @@ -1006,16 +1006,16 @@ " 1\n", " 0\n", " 11\n", - " a7ab12bd-7762-447f-b521-3e8d0d223111\n", + " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:29:02.832786\n", - " 2.139017\n", + " 2025-02-04T23:08:54.872145\n", + " 1.864833\n", " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", - " 0\n", + " 1\n", " \n", " \n", " 5\n", @@ -1024,16 +1024,16 @@ " 1\n", " 0\n", " 11\n", - " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", + " 5b12b0e8-946f-4538-8812-9ee74204c2d7\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:29:02.868823\n", - " 2.174988\n", + " 2025-02-04T23:08:54.969828\n", + " 1.962273\n", " earth.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", - " 5\n", + " 0\n", " \n", " \n", "\n", @@ -1057,12 +1057,12 @@ "5 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", - "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", - "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", - "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", - "4 a7ab12bd-7762-447f-b521-3e8d0d223111 14711865278795535908 pdf \n", - "5 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 
84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", + "5 5b12b0e8-946f-4538-8812-9ee74204c2d7 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1073,20 +1073,20 @@ "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T22:29:04.872187 1.999260 lorem-ipsum.pdf \n", - "1 2025-02-04T22:29:06.547382 1.595728 spam.pdf \n", - "2 2025-02-04T22:29:04.948884 2.111419 earth2.pdf \n", - "3 2025-02-04T22:29:06.454066 1.578924 mars.pdf \n", - "4 2025-02-04T22:29:02.832786 2.139017 earth-copy.pdf \n", - "5 2025-02-04T22:29:02.868823 2.174988 earth.pdf \n", + "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", + "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", + "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", + "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", + "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", + "5 2025-02-04T23:08:54.969828 1.962273 earth.pdf \n", "\n", " doc_hash int_id_column \n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 2 \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 4 \n", - "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 3 \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 0 \n", - "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 5 " + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 1 \n", + "5 6140cf695f269a3ddca6568536076756105ad3186086b2... 
0 " ] }, "execution_count": 14, @@ -1140,29 +1140,29 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:29:33 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", - "22:29:33 INFO - pipeline id pipeline_id\n", - "22:29:33 INFO - code location None\n", - "22:29:33 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "22:29:33 INFO - actor creation delay 0\n", - "22:29:33 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "22:29:33 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", - "22:29:33 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:29:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:29:33 INFO - Running locally\n", - "2025-02-04 22:29:34,292\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:35 INFO - orchestrator started at 2025-02-04 22:29:35\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:35 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:35 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.146868897601962, 'object_store': 4.5734344478696585}\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:35 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 4 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2139948)\u001b[0m 22:29:36 INFO - done flushing in 0.001 sec\n", - "22:29:46 INFO - Completed execution in 0.228 min, execution result 0\n" + "23:09:24 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'doc_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "23:09:24 INFO - pipeline id pipeline_id\n", + "23:09:24 INFO - code location None\n", + "23:09:24 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "23:09:24 INFO - actor creation delay 0\n", + "23:09:24 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:09:24 INFO - data factory data_ is using local data access: input_folder - output/02_docid_out output_folder - output/03_exact_dedupe_out\n", + "23:09:24 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:09:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:24 INFO - Running locally\n", + "2025-02-04 23:09:25,887\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - orchestrator started at 2025-02-04 23:09:27\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Number of files is 6, source profile {'max_file_size': 0.01116180419921875, 'min_file_size': 0.006641387939453125, 'total_file_size': 0.056290626525878906}\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.715737915597856, 'object_store': 4.357868957333267}\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:27 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 4 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed 4 files (66.667%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2173868)\u001b[0m 23:09:28 INFO - done flushing in 0.001 sec\n", + "23:09:38 INFO - Completed execution in 0.226 min, execution result 0\n" ] }, { @@ -1170,8 +1170,8 @@ "output_type": "stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 139 ms, sys: 163 ms, total: 301 ms\n", - "Wall time: 15 s\n" + "CPU times: user 144 ms, sys: 164 ms, total: 308 ms\n", + "Wall time: 14.8 s\n" ] } ], @@ -1275,16 +1275,16 @@ " 1\n", " 0\n", " 2\n", - " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", + " 3618834f-9dfc-49a1-9066-e2724df95fec\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T22:29:04.872187\n", - " 1.999260\n", + " 2025-02-04T23:08:56.820444\n", + " 1.846058\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 2\n", + " 3\n", " []\n", " \n", " \n", @@ -1294,16 +1294,16 @@ " 1\n", " 0\n", " 2\n", - " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", + " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T22:29:06.547382\n", - " 1.595728\n", + " 2025-02-04T23:08:58.414120\n", + " 1.590731\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", - " 4\n", + " 5\n", " []\n", " \n", " \n", @@ -1313,16 +1313,16 @@ " 1\n", " 0\n", " 11\n", - " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", + " 84f59118-2a64-4d4b-991c-10ca09576a74\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T22:29:04.948884\n", - " 2.111419\n", + " 2025-02-04T23:08:56.713495\n", + " 1.827202\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", - " 1\n", + " 2\n", " []\n", " \n", " \n", @@ -1332,35 +1332,35 @@ " 1\n", " 
0\n", " 11\n", - " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", + " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T22:29:06.454066\n", - " 1.578924\n", + " 2025-02-04T23:08:58.272496\n", + " 1.547326\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", - " 3\n", + " 4\n", " []\n", " \n", " \n", " 4\n", - " earth.pdf\n", + " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", + " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:29:02.868823\n", - " 2.174988\n", - " earth.pdf\n", + " 2025-02-04T23:08:54.872145\n", + " 1.864833\n", + " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", - " 5\n", + " 1\n", " []\n", " \n", " \n", @@ -1373,7 +1373,7 @@ "1 spam.pdf Free xxx \n", "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "4 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", @@ -1383,11 +1383,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", - "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", - "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", - "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", - "4 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1397,18 +1397,18 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T22:29:04.872187 1.999260 lorem-ipsum.pdf \n", - "1 2025-02-04T22:29:06.547382 1.595728 spam.pdf \n", - "2 2025-02-04T22:29:04.948884 2.111419 earth2.pdf \n", - "3 2025-02-04T22:29:06.454066 1.578924 mars.pdf \n", - "4 2025-02-04T22:29:02.868823 2.174988 earth.pdf \n", + "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", + "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", + "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", + "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", + "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 2 [] \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 4 [] \n", - "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 [] \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 
3 [] \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 5 [] " + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 1 [] " ] }, "execution_count": 16, @@ -1486,133 +1486,133 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:29:48 INFO - Starting SignatureCalculation step\n", - "22:29:48 INFO - Got parameters for SignatureCalculation\n", - "22:29:48 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.9, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "22:29:48 INFO - data factory scdata_ is using local configuration without input/output path\n", - "22:29:48 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "22:29:48 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:29:48 INFO - pipeline id pipeline_id\n", - "22:29:48 INFO - code location None\n", - "22:29:48 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "22:29:48 INFO - actor creation delay 0\n", - "22:29:48 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", - "22:29:48 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "22:29:48 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:29:48 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use 
['.parquet'], files to checkpoint ['.parquet']\n", - "22:29:48 INFO - Running locally\n", - "2025-02-04 22:29:49,424\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:50 INFO - orchestrator started at 2025-02-04 22:29:50\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:50 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05068492889404297}\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:50 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 9.059674073010683, 'object_store': 4.529837035574019}\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:50 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed 1 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed 2 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed 3 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed 3 files (50.0%) in 0.004 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - Completed processing 6 files in 0.004 min\n", - "\u001b[36m(orchestrate pid=2141678)\u001b[0m 22:29:52 INFO - done flushing in 0.029 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=2142575)\u001b[0m 22:29:52 INFO - Starting flush()\n", - "\u001b[36m(RayTransformFileProcessor pid=2142575)\u001b[0m 22:29:52 INFO - Wrote 14 tables with a total size of 6,720 bytes\n", - "\u001b[36m(RayTransformFileProcessor pid=2142576)\u001b[0m 22:29:52 WARNING - table is empty, skipping processing\n", - "22:30:02 INFO - Completed execution in 0.23 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=2142576)\u001b[0m 22:29:52 INFO - Starting flush()\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=2142576)\u001b[0m 22:29:52 INFO - Wrote 14 tables with a total size of 6,720 bytes\u001b[32m [repeated 2x across cluster]\u001b[0m\n", - "22:30:03 INFO - SignatureCalculation completed successfully\n", - "22:30:03 INFO - Starting ClusterAnalysis step\n", - "22:30:03 INFO - Got parameters for ClusterAnalysis\n", - "22:30:03 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.9, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "22:30:03 INFO - pipeline id pipeline_id\n", - "22:30:03 INFO - code location None\n", - "22:30:03 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "22:30:03 INFO - actor creation delay 0\n", - "22:30:03 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", - "22:30:03 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands 
output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", - "22:30:03 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:30:03 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:30:03 INFO - Running locally\n", - "2025-02-04 22:30:04,465\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:05 INFO - orchestrator started at 2025-02-04 22:30:05\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:05 INFO - Number of folders is 14\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:05 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.936753083020449, 'object_store': 4.468376540578902}\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:05 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 2 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 3 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 4 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 5 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 6 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 7 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 8 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 9 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 10 files in 0.0 
min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 11 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed 11 files (78.571%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - Completed processing 14 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2143303)\u001b[0m 22:30:06 INFO - done flushing in 0.001 sec\n", - "22:30:16 INFO - Completed execution in 0.222 min, execution result 0\n", - "22:30:18 INFO - ClusterAnalysis completed successfully\n", - "22:30:18 INFO - Starting GetDuplicateList step\n", - "22:30:18 INFO - Got parameters for GetDuplicateList\n", - "22:30:18 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "22:30:18 INFO - pipeline id pipeline_id\n", - "22:30:18 INFO - code location None\n", - "22:30:18 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "22:30:18 INFO - actor creation delay 0\n", - "22:30:18 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", - "22:30:18 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", - "22:30:18 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:30:18 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:30:18 INFO - Running locally\n", - "2025-02-04 22:30:18,965\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:20 INFO - orchestrator started at 2025-02-04 22:30:20\n", - "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:20 INFO - Number of folders is 1\n", - "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:20 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.91086196899414, 'object_store': 4.45543098449707}\n", - "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:20 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:21 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=2145840)\u001b[0m 22:30:21 INFO - Get Duplicate List for folder docs_to_remove\n", - "\u001b[36m(RayTransformFileProcessor pid=2145840)\u001b[0m 22:30:21 INFO - 0 documents marked as duplicates\n", - "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:21 INFO - Completed processing 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=2144976)\u001b[0m 22:30:21 INFO - done flushing in 0.001 sec\n", - "22:30:31 INFO - Completed execution in 0.222 min, execution result 0\n", - "22:30:32 INFO - GetDuplicateList completed successfully\n", - "22:30:32 INFO - Starting DataCleaning step\n", - "22:30:32 INFO - Got parameters for DataCleaning\n", - "22:30:32 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "22:30:32 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "22:30:32 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "22:30:32 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:30:32 
INFO - pipeline id pipeline_id\n", - "22:30:32 INFO - code location None\n", - "22:30:32 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "22:30:32 INFO - actor creation delay 0\n", - "22:30:32 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", - "22:30:32 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", - "22:30:32 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:30:32 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:30:32 INFO - Running locally\n", - "2025-02-04 22:30:33,492\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:34 INFO - orchestrator started at 2025-02-04 22:30:34\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:34 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.05068492889404297}\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:34 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.780509186908603, 'object_store': 4.390254592522979}\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:34 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed 3 files 
(50.0%) in 0.003 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - Completed processing 6 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2146582)\u001b[0m 22:30:36 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=2147461)\u001b[0m 22:30:36 WARNING - table is empty, skipping processing\n", - "22:30:46 INFO - Completed execution in 0.227 min, execution result 0\n", - "22:30:47 INFO - DataCleaning completed successfully\n" + "23:09:39 INFO - Starting SignatureCalculation step\n", + "23:09:39 INFO - Got parameters for SignatureCalculation\n", + "23:09:39 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.9, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "23:09:39 INFO - data factory scdata_ is using local configuration without input/output path\n", + "23:09:39 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "23:09:39 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:39 INFO - pipeline id pipeline_id\n", + "23:09:39 INFO - code location None\n", + "23:09:39 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "23:09:39 INFO - actor creation delay 0\n", + "23:09:39 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:09:39 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:09:39 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:09:39 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use 
['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:39 INFO - Running locally\n", + "2025-02-04 23:09:40,737\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - orchestrator started at 2025-02-04 23:09:41\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.727081298828125, 'object_store': 4.3635406494140625}\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:41 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed 3 files (50.0%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2175466)\u001b[0m 23:09:43 INFO - done flushing in 0.026 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 WARNING - table is empty, skipping processing\n", + "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 INFO - Starting flush()\n", + "\u001b[36m(RayTransformFileProcessor pid=2176344)\u001b[0m 23:09:43 INFO - Wrote 14 tables with a total size of 6,720 bytes\n", + "23:09:53 INFO - Completed execution in 0.224 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=2176343)\u001b[0m 23:09:43 INFO - Starting flush()\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", + "\u001b[36m(RayTransformFileProcessor pid=2176343)\u001b[0m 23:09:43 INFO - Wrote 14 tables with a total size of 13,440 bytes\u001b[32m [repeated 2x across cluster]\u001b[0m\n", + "23:09:54 INFO - SignatureCalculation completed successfully\n", + "23:09:54 INFO - Starting ClusterAnalysis step\n", + "23:09:54 INFO - Got parameters for ClusterAnalysis\n", + "23:09:54 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.9, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "23:09:54 INFO - pipeline id pipeline_id\n", + "23:09:54 INFO - code location None\n", + "23:09:54 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "23:09:54 INFO - actor creation delay 0\n", + "23:09:54 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:09:54 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/bands 
output_folder - output/04_fuzzy_dedupe_out/docs_to_remove\n", + "23:09:54 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:09:54 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:09:54 INFO - Running locally\n", + "2025-02-04 23:09:55,736\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - orchestrator started at 2025-02-04 23:09:56\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Number of folders is 14\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.763642883859575, 'object_store': 4.381821441464126}\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:56 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 2 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 3 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 4 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 5 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 6 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 7 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 8 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 9 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 10 files in 0.0 
min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 11 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed 11 files (78.571%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - Completed processing 14 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2177032)\u001b[0m 23:09:58 INFO - done flushing in 0.001 sec\n", + "23:10:08 INFO - Completed execution in 0.222 min, execution result 0\n", + "23:10:09 INFO - ClusterAnalysis completed successfully\n", + "23:10:09 INFO - Starting GetDuplicateList step\n", + "23:10:09 INFO - Got parameters for GetDuplicateList\n", + "23:10:09 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "23:10:09 INFO - pipeline id pipeline_id\n", + "23:10:09 INFO - code location None\n", + "23:10:09 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "23:10:09 INFO - actor creation delay 0\n", + "23:10:09 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:10:09 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out output_folder - output/04_fuzzy_dedupe_out\n", + "23:10:09 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:10:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:10:09 INFO - Running locally\n", + "2025-02-04 23:10:10,430\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - orchestrator started at 2025-02-04 23:10:11\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Number of folders is 1\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.75473709218204, 'object_store': 4.3773685451596975}\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:11 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - Completed processing 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=2178590)\u001b[0m 23:10:12 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=2179461)\u001b[0m 23:10:12 INFO - Get Duplicate List for folder docs_to_remove\n", + "\u001b[36m(RayTransformFileProcessor pid=2179461)\u001b[0m 23:10:12 INFO - 0 documents marked as duplicates\n", + "23:10:22 INFO - Completed execution in 0.223 min, execution result 0\n", + "23:10:24 INFO - GetDuplicateList completed successfully\n", + "23:10:24 INFO - Starting DataCleaning step\n", + "23:10:24 INFO - Got parameters for DataCleaning\n", + "23:10:24 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "23:10:24 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "23:10:24 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "23:10:24 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + 
"23:10:24 INFO - pipeline id pipeline_id\n", + "23:10:24 INFO - code location None\n", + "23:10:24 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "23:10:24 INFO - actor creation delay 0\n", + "23:10:24 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:10:24 INFO - data factory data_ is using local data access: input_folder - output/03_exact_dedupe_out output_folder - output/04_fuzzy_dedupe_out/cleaned\n", + "23:10:24 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:10:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:10:24 INFO - Running locally\n", + "2025-02-04 23:10:25,111\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - orchestrator started at 2025-02-04 23:10:26\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Number of files is 6, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.003223419189453125, 'total_file_size': 0.050751686096191406}\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.744503784924746, 'object_store': 4.37225189153105}\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:26 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed 3 
files (50.0%) in 0.003 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - Completed processing 6 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2180004)\u001b[0m 23:10:27 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=2180888)\u001b[0m 23:10:27 WARNING - table is empty, skipping processing\n", + "23:10:37 INFO - Completed execution in 0.224 min, execution result 0\n", + "23:10:38 INFO - DataCleaning completed successfully\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 474 ms, sys: 520 ms, total: 994 ms\n", - "Wall time: 59.4 s\n" + "CPU times: user 603 ms, sys: 679 ms, total: 1.28 s\n", + "Wall time: 59.2 s\n" ] } ], @@ -1721,16 +1721,16 @@ " 1\n", " 0\n", " 2\n", - " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", + " 3618834f-9dfc-49a1-9066-e2724df95fec\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", " 35\n", - " 2025-02-04T22:29:04.872187\n", - " 1.999260\n", + " 2025-02-04T23:08:56.820444\n", + " 1.846058\n", " lorem-ipsum.pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", - " 2\n", + " 3\n", " []\n", " \n", " \n", @@ -1740,16 +1740,16 @@ " 1\n", " 0\n", " 2\n", - " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", + " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", " 8\n", - " 2025-02-04T22:29:06.547382\n", - " 1.595728\n", + " 2025-02-04T23:08:58.414120\n", + " 1.590731\n", " spam.pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", - " 4\n", + " 5\n", " []\n", " \n", " \n", @@ -1759,16 +1759,16 @@ " 1\n", " 0\n", " 11\n", - " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", + " 84f59118-2a64-4d4b-991c-10ca09576a74\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", " 610\n", - " 2025-02-04T22:29:04.948884\n", - " 2.111419\n", + " 2025-02-04T23:08:56.713495\n", + " 
1.827202\n", " earth2.pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", - " 1\n", + " 2\n", " []\n", " \n", " \n", @@ -1778,35 +1778,35 @@ " 1\n", " 0\n", " 11\n", - " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", + " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", " 717\n", - " 2025-02-04T22:29:06.454066\n", - " 1.578924\n", + " 2025-02-04T23:08:58.272496\n", + " 1.547326\n", " mars.pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", - " 3\n", + " 4\n", " []\n", " \n", " \n", " 4\n", - " earth.pdf\n", + " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", + " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", " 610\n", - " 2025-02-04T22:29:02.868823\n", - " 2.174988\n", - " earth.pdf\n", + " 2025-02-04T23:08:54.872145\n", + " 1.864833\n", + " earth-copy.pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", - " 5\n", + " 1\n", " []\n", " \n", " \n", @@ -1819,7 +1819,7 @@ "1 spam.pdf Free xxx \n", "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "4 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", @@ -1829,11 +1829,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", - "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", - "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", - "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", - "4 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", "\n", " hash size \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 \n", @@ -1843,18 +1843,18 @@ "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 610 \n", "\n", " date_acquired pdf_convert_time source_filename \\\n", - "0 2025-02-04T22:29:04.872187 1.999260 lorem-ipsum.pdf \n", - "1 2025-02-04T22:29:06.547382 1.595728 spam.pdf \n", - "2 2025-02-04T22:29:04.948884 2.111419 earth2.pdf \n", - "3 2025-02-04T22:29:06.454066 1.578924 mars.pdf \n", - "4 2025-02-04T22:29:02.868823 2.174988 earth.pdf \n", + "0 2025-02-04T23:08:56.820444 1.846058 lorem-ipsum.pdf \n", + "1 2025-02-04T23:08:58.414120 1.590731 spam.pdf \n", + "2 2025-02-04T23:08:56.713495 1.827202 earth2.pdf \n", + "3 2025-02-04T23:08:58.272496 1.547326 mars.pdf \n", + "4 2025-02-04T23:08:54.872145 1.864833 earth-copy.pdf \n", "\n", " doc_hash int_id_column removed \n", - "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 2 [] \n", - "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 4 [] \n", - "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 1 [] \n", - "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 
3 [] \n", - "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 5 [] " + "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 3 [] \n", + "1 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... 5 [] \n", + "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 2 [] \n", + "3 a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... 4 [] \n", + "4 6140cf695f269a3ddca6568536076756105ad3186086b2... 1 [] " ] }, "execution_count": 18, @@ -1918,33 +1918,33 @@ "name": "stderr", "output_type": "stream", "text": [ - "22:30:47 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "22:30:47 INFO - data factory docq_ is using local configuration without input/output path\n", - "22:30:47 INFO - data factory docq_ max_files -1, n_sample -1\n", - "22:30:47 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:30:47 INFO - pipeline id pipeline_id\n", - "22:30:47 INFO - code location None\n", - "22:30:47 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "22:30:47 INFO - actor creation delay 0\n", - "22:30:47 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", - "22:30:47 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", - "22:30:47 INFO - data factory data_ max_files -1, n_sample -1\n", - "22:30:47 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "22:30:47 INFO - Running locally\n", - "2025-02-04 22:30:48,687\tINFO 
worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:49 INFO - orchestrator started at 2025-02-04 22:30:49\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:49 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0069904327392578125, 'total_file_size': 0.047461509704589844}\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:49 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.778627778403461, 'object_store': 4.389313887804747}\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:49 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(RayTransformFileProcessor pid=2149145)\u001b[0m 22:30:50 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed 1 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed 2 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed 3 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed 3 files (60.0%) in 0.003 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - Completed processing 5 files in 0.003 min\n", - "\u001b[36m(orchestrate pid=2148278)\u001b[0m 22:30:51 INFO - done flushing in 0.001 sec\n", - "22:31:01 INFO - Completed execution in 0.224 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=2149144)\u001b[0m 22:30:50 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n" + "23:10:38 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "23:10:38 INFO - data factory docq_ is using local configuration without input/output path\n", + "23:10:38 INFO - data factory docq_ max_files -1, n_sample -1\n", + "23:10:38 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:10:38 INFO - pipeline id pipeline_id\n", + "23:10:38 INFO - code location None\n", + "23:10:38 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "23:10:38 INFO - actor creation delay 0\n", + "23:10:38 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", + "23:10:38 INFO - data factory data_ is using local data access: input_folder - output/04_fuzzy_dedupe_out/cleaned output_folder - output/05_doc_quality_out\n", + "23:10:38 INFO - data factory data_ max_files -1, n_sample -1\n", + "23:10:38 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "23:10:38 INFO - 
Running locally\n", + "2025-02-04 23:10:39,863\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - orchestrator started at 2025-02-04 23:10:41\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Number of files is 5, source profile {'max_file_size': 0.011510848999023438, 'min_file_size': 0.0069904327392578125, 'total_file_size': 0.04752826690673828}\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.727170563302934, 'object_store': 4.363585281185806}\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:41 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(RayTransformFileProcessor pid=2182506)\u001b[0m 23:10:41 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 1 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 2 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 3 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed 3 files (60.0%) in 0.003 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - Completed processing 5 files in 0.003 min\n", + "\u001b[36m(orchestrate pid=2181634)\u001b[0m 23:10:42 INFO - done flushing in 0.001 sec\n", + "23:10:52 INFO - Completed execution in 0.223 min, execution result 0\n", + "\u001b[36m(RayTransformFileProcessor pid=2182507)\u001b[0m 23:10:41 INFO - Load badwords found locally from /home/sujee/apps/anaconda3/envs/dpk-6-pdf-processing-r1.0.0-all-py3.11/lib/python3.11/site-packages/dpk_doc_quality/ldnoobw/en\n" ] }, { @@ -1952,8 +1952,8 @@ "output_type": "stream", "text": [ "✅ Stage:5 completed successfully\n", - "CPU times: user 116 ms, sys: 153 ms, total: 269 ms\n", - "Wall time: 14.7 s\n" + "CPU times: user 116 ms, sys: 125 ms, total: 240 ms\n", + "Wall time: 14.8 s\n" ] } ], @@ -2065,7 +2065,7 @@ " 1\n", " 0\n", " 2\n", - " bebe8e77-d5d4-4c43-8073-c8a75b134fdc\n", + " 3618834f-9dfc-49a1-9066-e2724df95fec\n", " 6571294142213095721\n", " pdf\n", " bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5...\n", @@ -2089,7 +2089,7 @@ " 1\n", " 0\n", " 2\n", - " ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98\n", + " 27880888-8e1a-4b46-a6a9-fecba8eee0eb\n", " 10026122586747302274\n", " pdf\n", " 543ffc97aef373ee009a5f908e0358ef80d329ca7ba964...\n", @@ -2113,7 +2113,7 @@ " 1\n", " 0\n", " 11\n", - " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", + " 84f59118-2a64-4d4b-991c-10ca09576a74\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", @@ -2137,7 +2137,7 @@ " 1\n", " 0\n", " 11\n", - " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", + " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2156,12 +2156,12 @@ " \n", " \n", " 4\n", - " earth.pdf\n", + " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", + " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", 
" 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2189,7 +2189,7 @@ "1 spam.pdf Free xxx \n", "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", - "4 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", "\n", " num_pages num_tables num_doc_elements \\\n", "0 1 0 2 \n", @@ -2199,11 +2199,11 @@ "4 1 0 11 \n", "\n", " document_id document_hash ext \\\n", - "0 bebe8e77-d5d4-4c43-8073-c8a75b134fdc 6571294142213095721 pdf \n", - "1 ededf5d3-2d3a-4ce7-83c2-5c72d0aa9c98 10026122586747302274 pdf \n", - "2 8c65dd19-fa9c-47ae-9099-f8daeb568755 10729312978404042321 pdf \n", - "3 2cef2fa6-ffa7-477f-a2f5-323ee68289aa 7758129997476962679 pdf \n", - "4 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 14711865278795535908 pdf \n", + "0 3618834f-9dfc-49a1-9066-e2724df95fec 6571294142213095721 pdf \n", + "1 27880888-8e1a-4b46-a6a9-fecba8eee0eb 10026122586747302274 pdf \n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "0 bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... 35 ... 
\n", @@ -2326,7 +2326,7 @@ " 1\n", " 0\n", " 11\n", - " 8c65dd19-fa9c-47ae-9099-f8daeb568755\n", + " 84f59118-2a64-4d4b-991c-10ca09576a74\n", " 10729312978404042321\n", " pdf\n", " f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4...\n", @@ -2350,7 +2350,7 @@ " 1\n", " 0\n", " 11\n", - " 2cef2fa6-ffa7-477f-a2f5-323ee68289aa\n", + " 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff\n", " 7758129997476962679\n", " pdf\n", " a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e...\n", @@ -2369,12 +2369,12 @@ " \n", " \n", " 4\n", - " earth.pdf\n", + " earth-copy.pdf\n", " ## Earth\\n\\n## Solar System\\n\\nOur solar syste...\n", " 1\n", " 0\n", " 11\n", - " 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6\n", + " 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44\n", " 14711865278795535908\n", " pdf\n", " 6140cf695f269a3ddca6568536076756105ad3186086b2...\n", @@ -2397,20 +2397,20 @@ "" ], "text/plain": [ - " filename contents num_pages \\\n", - "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 1 \n", - "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... 1 \n", - "4 earth.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 1 \n", + " filename contents \\\n", + "2 earth2.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... \n", + "3 mars.pdf ## Mars\\n\\n## Solar System\\n\\nOur solar system... \n", + "4 earth-copy.pdf ## Earth\\n\\n## Solar System\\n\\nOur solar syste... 
\n", "\n", - " num_tables num_doc_elements document_id \\\n", - "2 0 11 8c65dd19-fa9c-47ae-9099-f8daeb568755 \n", - "3 0 11 2cef2fa6-ffa7-477f-a2f5-323ee68289aa \n", - "4 0 11 41b2e5a9-3d9d-4eb7-980d-f0f22edcb1e6 \n", + " num_pages num_tables num_doc_elements \\\n", + "2 1 0 11 \n", + "3 1 0 11 \n", + "4 1 0 11 \n", "\n", - " document_hash ext \\\n", - "2 10729312978404042321 pdf \n", - "3 7758129997476962679 pdf \n", - "4 14711865278795535908 pdf \n", + " document_id document_hash ext \\\n", + "2 84f59118-2a64-4d4b-991c-10ca09576a74 10729312978404042321 pdf \n", + "3 71ef93eb-66d3-4a21-bb47-cf4b85d4f8ff 7758129997476962679 pdf \n", + "4 7ce227b2-66cb-4e2e-b76b-3a0a8c9d2f44 14711865278795535908 pdf \n", "\n", " hash size ... \\\n", "2 f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... 610 ... \n",