diff --git a/ADVANCED.md b/ADVANCED.md new file mode 100644 index 0000000000..da95bdc5cd --- /dev/null +++ b/ADVANCED.md @@ -0,0 +1,71 @@ + +
TASK: {task}
\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "\n", + "from llm_utils.logging import prep_loggers\n", + "os.environ[\"LLM_LOG_PATH\"] = \"./logs/llm_log.txt\"\n", + "prep_loggers(\"llm=INFO\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The tools in DPK agents are the transforms.\n", + "# Each tool is described as a JSON dictionary with its name, description, input parameters, and how to import it.\n", + "# The list of tools is in the llm_utils/dpk/tools.py file.\n", + "from llm_utils.dpk.tools import *\n", + "print(tools_json)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is an example of a plan for a simple task. It is passed to the prompt to enhance the planning results.\n", + "from llm_utils.dpk.examples import *\n", + "print(example_task)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is a string that contains several constraints on the order of the tools in the plan.\n", + "# It is free text and can be found in the llm_utils/dpk/constraints.py file.\n", + "from llm_utils.dpk.constraints import *\n", + "print(constraints)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define LLM models\n", + "\n", + "We have tested our project with the following LLM execution frameworks: [Watsonx](https://www.ibm.com/watsonx), [Replicate](https://replicate.com/), and locally running [Ollama](https://ollama.com/).\n", + "To use one of the frameworks, uncomment its part in the cell below while commenting out the other frameworks.\n", + "Please note that the notebooks have been tested with specific Large Language Models (LLMs) that are mentioned in the cell, and due to the inherent nature of LLMs, using a different 
model may not produce the same results.\n", + "\n", + "- To use Replicate:\n", + " - Obtain Replicate API token\n", + " - Store the following value in the `.env` file located in your project directory:\n", + " ```\n", + " REPLICATE_API_TOKEN=TASK: {task}
\"\n", + "print_pdf2parquet=f\"PDF2PARQUET Params: {pdf2parquet_params}
\"\n", + "print_doc_chunks=f\"DOC CHUNKS Params: {doc_chunk_params}
\"\n", + "print_doc_id_params=f\"DOC_ID Params: {doc_id_params}
\"\n", + "print_ededup_params=f\"EDEDUP Params: {ededup_params}
\"\n", + "print_text_encoder_params=f\"TEXT_ENCODER Params: {text_encoder_params}
\"\n", + "\n", + "HTML(f\"{print_task}{print_pdf2parquet}{print_doc_chunks}{print_doc_id_params}{print_ededup_params}{print_text_encoder_params}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define LLM models and tools\n", + "\n", + "We have tested our project with the following LLM execution frameworks: [Watsonx](https://www.ibm.com/watsonx), [Replicate](https://replicate.com/), and locally running [Ollama](https://ollama.com/).\n", + "To use one of the frameworks, uncomment its part in the cell below while commenting out the other frameworks.\n", + "Please note that the notebooks have been tested with specific Large Language Models (LLMs) that are mentioned in the cell, and due to the inherent nature of LLMs, using a different model may not produce the same results.\n", + "\n", + "- To use Replicate:\n", + " - Obtain Replicate API token\n", + " - Store the following value in the `.env` file located in your project directory:\n", + " ```\n", + " REPLICATE_API_TOKEN=\n", - " | filename | \n", - "contents | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "document_id | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "{\"_name\":\"\",\"type\":\"pdf-document\",\"description... | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "|
1 | \n", - "earth.pdf | \n", - "{\"_name\":\"\",\"type\":\"pdf-document\",\"description... | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.84518433, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Solar System\\nFor more details about the Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.18510437, 570.83258057, 374.99838257, 581... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "|
3 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.87112427, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "|
6 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "|
7 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "
\n", - " | filename | \n", - "contents | \n", - "
---|---|---|
0 | \n", - "mars.pdf | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "
1 | \n", - "mars.pdf | \n", - "Solar System\\nFor more details about the Solar... | \n", - "
2 | \n", - "mars.pdf | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "
3 | \n", - "mars.pdf | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "
4 | \n", - "earth.pdf | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "
5 | \n", - "earth.pdf | \n", - "Solar System\\nFor more details about our Solar... | \n", - "
6 | \n", - "earth.pdf | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "
7 | \n", - "earth.pdf | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.84518433, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "4 | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Solar System\\nFor more details about the Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.18510437, 570.83258057, 374.99838257, 581... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "5 | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "6 | \n", - "|
3 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7 | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.87112427, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "0 | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "1 | \n", - "|
6 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "2 | \n", - "|
7 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "3 | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "removed | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Solar System\\nFor more details about the Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.18510437, 570.83258057, 374.99838257, 581... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "5 | \n", - "[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "6 | \n", - "[] | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7 | \n", - "[] | \n", - "|
3 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.87112427, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "0 | \n", - "[] | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "1 | \n", - "[] | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "2 | \n", - "[] | \n", - "|
6 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "3 | \n", - "[] | \n", - "
\n", - " | filename | \n", - "contents | \n", - "
---|---|---|
0 | \n", - "mars.pdf | \n", - "Solar System\\nFor more details about the Solar... | \n", - "
1 | \n", - "mars.pdf | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "
2 | \n", - "mars.pdf | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "
3 | \n", - "earth.pdf | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "
4 | \n", - "earth.pdf | \n", - "Solar System\\nFor more details about our Solar... | \n", - "
5 | \n", - "earth.pdf | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "
6 | \n", - "earth.pdf | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "removed | \n", - "embeddings | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Solar System\\nFor more details about the Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.18510437, 570.83258057, 374.99838257, 581... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "5 | \n", - "[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... | \n", - "[-0.051861435, 0.0035226212, 0.030617002, 0.04... | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "6 | \n", - "[] | \n", - "[0.07728295, 0.024970993, -0.043180738, 0.0580... | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:34:44.259545 | \n", - "0.845978 | \n", - "mars.pdf | \n", - "6e9fd08a-a4e2-47da-b5a9-bb1e1a3ab6e2 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7 | \n", - "[] | \n", - "[0.10598018, 0.025460618, 0.023627337, 0.03905... | \n", - "|
3 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.87112427, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "0 | \n", - "[] | \n", - "[0.0077404436, -0.02055944, 0.026426593, 0.011... | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "1 | \n", - "[] | \n", - "[-0.062105548, -0.0053322907, 0.031277698, 0.0... | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "2 | \n", - "[] | \n", - "[0.072435796, -0.058001805, -0.019771898, -0.0... | \n", - "|
6 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:34:43.410297 | \n", - "0.794765 | \n", - "earth.pdf | \n", - "efbdbcb9-f0af-42f0-b191-2f14ce3ddc7c | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "3 | \n", - "[] | \n", - "[0.091821924, 0.015197902, 0.07716932, 0.01711... | \n", - "
\n", - " | filename | \n", - "contents | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "document_id | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "{\"_name\":\"\",\"type\":\"pdf-document\",\"description... | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "|
1 | \n", - "earth.pdf | \n", - "{\"_name\":\"\",\"type\":\"pdf-document\",\"description... | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.84518433, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Solar System\\nFor more details about the Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.18510437, 570.83258057, 374.99838257, 581... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "|
3 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.87112427, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "|
6 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "|
7 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "
\n", - " | filename | \n", - "contents | \n", - "
---|---|---|
0 | \n", - "mars.pdf | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "
1 | \n", - "mars.pdf | \n", - "Solar System\\nFor more details about the Solar... | \n", - "
2 | \n", - "mars.pdf | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "
3 | \n", - "mars.pdf | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "
4 | \n", - "earth.pdf | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "
5 | \n", - "earth.pdf | \n", - "Solar System\\nFor more details about our Solar... | \n", - "
6 | \n", - "earth.pdf | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "
7 | \n", - "earth.pdf | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.84518433, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "4 | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Solar System\\nFor more details about the Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.18510437, 570.83258057, 374.99838257, 581... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "5 | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "6 | \n", - "|
3 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7 | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.87112427, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "0 | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "1 | \n", - "|
6 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "2 | \n", - "|
7 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "3 | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "removed | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Solar System\\nFor more details about the Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.18510437, 570.83258057, 374.99838257, 581... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "dee4c03474c98efdabbadbcc4ce91138c7820f4ac8ff07... | \n", - "5 | \n", - "[44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf567... | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "6 | \n", - "[] | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7 | \n", - "[] | \n", - "|
3 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.87112427, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "0 | \n", - "[] | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "1 | \n", - "[] | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "2 | \n", - "[] | \n", - "|
6 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "3 | \n", - "[] | \n", - "
\n", - " | filename | \n", - "contents | \n", - "
---|---|---|
0 | \n", - "mars.pdf | \n", - "Solar System\\nFor more details about the Solar... | \n", - "
1 | \n", - "mars.pdf | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "
2 | \n", - "mars.pdf | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "
3 | \n", - "earth.pdf | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "
4 | \n", - "earth.pdf | \n", - "Solar System\\nFor more details about our Solar... | \n", - "
5 | \n", - "earth.pdf | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "
6 | \n", - "earth.pdf | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_id | \n", - "chunk_hash | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.84518433, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "4 | \n", - "-1 | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "6 | \n", - "-1 | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7 | \n", - "-1 | \n", - "|
3 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "1 | \n", - "5 | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "2 | \n", - "-1 | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "3 | \n", - "-1 | \n", - "
\n", - " | filename | \n", - "contents | \n", - "
---|---|---|
0 | \n", - "mars.pdf | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "
1 | \n", - "mars.pdf | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "
2 | \n", - "mars.pdf | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "
3 | \n", - "earth.pdf | \n", - "Solar System\\nFor more details about our Solar... | \n", - "
4 | \n", - "earth.pdf | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "
5 | \n", - "earth.pdf | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_id | \n", - "chunk_hash | \n", - "embeddings | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Solar System\\nOur solar system is a vast and f... | \n", - "$.main-text[2] | \n", - "1 | \n", - "[132.84518433, 588.96014404, 479.40917969, 623... | \n", - "44c6e373258c7cdc03f75a8e96a9b160f9aa4e4baf5674... | \n", - "4 | \n", - "-1 | \n", - "[0.0077404897, -0.020559434, 0.026426662, 0.01... | \n", - "|
1 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Mars\\nMars, the fourth planet from the Sun, is... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.87440491, 500.84011841, 477.48345947, 534... | \n", - "a31663e06fac41470ecc459f5a58658a3f9997d7801053... | \n", - "6 | \n", - "-1 | \n", - "[0.07728298, 0.024971062, -0.04318075, 0.05809... | \n", - "|
2 | \n", - "mars.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "8edd5dfbf888777120b528a5d8998f2757d006df0eaef7... | \n", - "2800 | \n", - "2024-10-18T13:30:59.490007 | \n", - "2.011138 | \n", - "mars.pdf | \n", - "62e5639f-f922-4ccc-a041-3cb02f1cfd83 | \n", - "Basic facts about Mars:\\n· Distance from the S... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.2026062, 482.90710449, 237.04431152, 493.... | \n", - "7ff317954ec5f3b15607c053c30c2b0db0f6b64cc3295a... | \n", - "7 | \n", - "-1 | \n", - "[0.1059802, 0.025460616, 0.02362733, 0.0390564... | \n", - "|
3 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Solar System\\nFor more details about our Solar... | \n", - "$.main-text[3] | \n", - "1 | \n", - "[133.20942688, 570.81555176, 375.57919312, 581... | \n", - "d7be13d7dee96cf2384072d0eb01981e0e75eec2e7bc6d... | \n", - "1 | \n", - "5 | \n", - "[-0.062105577, -0.0053322953, 0.03127779, 0.04... | \n", - "|
4 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nEarth is the third planet from the Sun.... | \n", - "$.main-text[5] | \n", - "1 | \n", - "[132.91053772, 512.46295166, 477.84887695, 534... | \n", - "7c4a750e2215f231803a6f8078bde1e9699034fb033dd3... | \n", - "2 | \n", - "-1 | \n", - "[0.0724358, -0.058001805, -0.01977186, -0.0243... | \n", - "|
5 | \n", - "earth.pdf | \n", - "1 | \n", - "0 | \n", - "11 | \n", - "18713f970989055625bef22209b6f4b6830b9ca22046bf... | \n", - "2686 | \n", - "2024-10-18T13:30:59.494027 | \n", - "2.015123 | \n", - "earth.pdf | \n", - "f3c0ac2e-1de2-472b-8216-2043f3b3e9d1 | \n", - "Earth\\nBasic facts about Earth:\\n· Distance fr... | \n", - "$.main-text[6] | \n", - "1 | \n", - "[133.30151367, 494.86206055, 240.17156982, 505... | \n", - "189a221704d17feeb96b1b1ef60a2a2445459848cd8e8f... | \n", - "3 | \n", - "-1 | \n", - "[0.091821924, 0.015197907, 0.07716932, 0.01711... | \n", - "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "2025-02-06T13:54:32.155384 | \n", + "0.651216 | \n", + "lorem-ipsum.pdf | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "2bd06750-cb70-4689-b2b8-72913b929a1d | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "2025-02-06T13:54:33.440651 | \n", + "0.617823 | \n", + "spam.pdf | \n", + "|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "594034db-1fcd-411b-a89e-d37e4defdfc2 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "2025-02-06T13:54:31.502460 | \n", + "0.645348 | \n", + "earth2.pdf | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "2025-02-06T13:54:32.821365 | \n", + "0.664288 | \n", + "mars.pdf | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T13:54:29.909555 | \n", + "1.100482 | \n", + "earth-copy.pdf | \n", + "|
5 | \n", + "earth.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "d1d30fbc-c1e9-4813-a067-085e50b4ee49 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T13:54:30.855225 | \n", + "0.931613 | \n", + "earth.pdf | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "doc_hash | \n", + "int_id_column | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "2025-02-06T13:54:32.155384 | \n", + "0.651216 | \n", + "lorem-ipsum.pdf | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "3 | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "2bd06750-cb70-4689-b2b8-72913b929a1d | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "2025-02-06T13:54:33.440651 | \n", + "0.617823 | \n", + "spam.pdf | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "5 | \n", + "|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "594034db-1fcd-411b-a89e-d37e4defdfc2 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "2025-02-06T13:54:31.502460 | \n", + "0.645348 | \n", + "earth2.pdf | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "2 | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "2025-02-06T13:54:32.821365 | \n", + "0.664288 | \n", + "mars.pdf | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "4 | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T13:54:29.909555 | \n", + "1.100482 | \n", + "earth-copy.pdf | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "0 | \n", + "|
5 | \n", + "earth.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "d1d30fbc-c1e9-4813-a067-085e50b4ee49 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T13:54:30.855225 | \n", + "0.931613 | \n", + "earth.pdf | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "1 | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "doc_hash | \n", + "int_id_column | \n", + "removed | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "2025-02-06T13:54:32.155384 | \n", + "0.651216 | \n", + "lorem-ipsum.pdf | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "3 | \n", + "[] | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "2bd06750-cb70-4689-b2b8-72913b929a1d | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "2025-02-06T13:54:33.440651 | \n", + "0.617823 | \n", + "spam.pdf | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "5 | \n", + "[] | \n", + "|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "594034db-1fcd-411b-a89e-d37e4defdfc2 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "2025-02-06T13:54:31.502460 | \n", + "0.645348 | \n", + "earth2.pdf | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "2 | \n", + "[] | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "2025-02-06T13:54:32.821365 | \n", + "0.664288 | \n", + "mars.pdf | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "4 | \n", + "[] | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T13:54:29.909555 | \n", + "1.100482 | \n", + "earth-copy.pdf | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "0 | \n", + "[] | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "doc_hash | \n", + "int_id_column | \n", + "removed | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "2025-02-06T13:54:32.155384 | \n", + "0.651216 | \n", + "lorem-ipsum.pdf | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "3 | \n", + "[] | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "2bd06750-cb70-4689-b2b8-72913b929a1d | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "2025-02-06T13:54:33.440651 | \n", + "0.617823 | \n", + "spam.pdf | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "5 | \n", + "[] | \n", + "|
2 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "2025-02-06T13:54:32.821365 | \n", + "0.664288 | \n", + "mars.pdf | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "4 | \n", + "[] | \n", + "|
3 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T13:54:29.909555 | \n", + "1.100482 | \n", + "earth-copy.pdf | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "0 | \n", + "[] | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "... | \n", + "docq_mean_word_len | \n", + "docq_symbol_to_word_ratio | \n", + "docq_sentence_count | \n", + "docq_lorem_ipsum_ratio | \n", + "docq_curly_bracket_ratio | \n", + "docq_contain_bad_word | \n", + "docq_bullet_point_ratio | \n", + "docq_ellipsis_line_ratio | \n", + "docq_alphabet_word_ratio | \n", + "docq_contain_common_en_words | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "4be2a61e-96f5-4f58-bf6f-e829dbdfa9d3 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "... | \n", + "5.000000 | \n", + "0.000000 | \n", + "1 | \n", + "0.085714 | \n", + "0.0 | \n", + "False | \n", + "0.000000 | \n", + "0.0 | \n", + "1.000000 | \n", + "False | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "2bd06750-cb70-4689-b2b8-72913b929a1d | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "... | \n", + "3.500000 | \n", + "0.000000 | \n", + "1 | \n", + "0.000000 | \n", + "0.0 | \n", + "True | \n", + "0.000000 | \n", + "0.0 | \n", + "1.000000 | \n", + "False | \n", + "|
2 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "... | \n", + "4.688000 | \n", + "0.032000 | \n", + "8 | \n", + "0.000000 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880000 | \n", + "True | \n", + "|
3 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "... | \n", + "4.541284 | \n", + "0.027523 | \n", + "9 | \n", + "0.000000 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880734 | \n", + "True | \n", + "
4 rows × 27 columns
\n", + "\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "... | \n", + "docq_mean_word_len | \n", + "docq_symbol_to_word_ratio | \n", + "docq_sentence_count | \n", + "docq_lorem_ipsum_ratio | \n", + "docq_curly_bracket_ratio | \n", + "docq_contain_bad_word | \n", + "docq_bullet_point_ratio | \n", + "docq_ellipsis_line_ratio | \n", + "docq_alphabet_word_ratio | \n", + "docq_contain_common_en_words | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "20ae1424-c2c3-436f-a7ff-b8c69fa3a3c3 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "... | \n", + "4.688000 | \n", + "0.032000 | \n", + "8 | \n", + "0.0 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880000 | \n", + "True | \n", + "|
3 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "4b43fb09-c9ef-4d9a-af24-8e22b5ff33b3 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "... | \n", + "4.541284 | \n", + "0.027523 | \n", + "9 | \n", + "0.0 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880734 | \n", + "True | \n", + "
2 rows × 27 columns
\n", + "\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "8dc8970e-215a-44fe-a7bf-946c03f36c60 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "2025-02-06T14:19:29.408910 | \n", + "1.912304 | \n", + "lorem-ipsum.pdf | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "9ac78463-b325-406b-891e-c9e84722eb34 | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "2025-02-06T14:19:30.986464 | \n", + "1.573836 | \n", + "spam.pdf | \n", + "|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "b3ed1942-54a6-49fc-bcbc-2d8c438adef3 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "2025-02-06T14:19:29.335271 | \n", + "1.850426 | \n", + "earth2.pdf | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "6d882651-2506-41cb-8704-85575c64b143 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "2025-02-06T14:19:30.950673 | \n", + "1.612200 | \n", + "mars.pdf | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "f8ccec16-576c-4e3e-8bec-359dff01d6d2 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T14:19:27.470409 | \n", + "2.071769 | \n", + "earth-copy.pdf | \n", + "|
5 | \n", + "earth.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "18d940f3-f4b4-46ac-9147-077675aead1d | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T14:19:27.492574 | \n", + "2.093768 | \n", + "earth.pdf | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "doc_hash | \n", + "int_id_column | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "8dc8970e-215a-44fe-a7bf-946c03f36c60 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "2025-02-06T14:19:29.408910 | \n", + "1.912304 | \n", + "lorem-ipsum.pdf | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "3 | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "9ac78463-b325-406b-891e-c9e84722eb34 | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "2025-02-06T14:19:30.986464 | \n", + "1.573836 | \n", + "spam.pdf | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "5 | \n", + "|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "b3ed1942-54a6-49fc-bcbc-2d8c438adef3 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "2025-02-06T14:19:29.335271 | \n", + "1.850426 | \n", + "earth2.pdf | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "2 | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "6d882651-2506-41cb-8704-85575c64b143 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "2025-02-06T14:19:30.950673 | \n", + "1.612200 | \n", + "mars.pdf | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "4 | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "f8ccec16-576c-4e3e-8bec-359dff01d6d2 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T14:19:27.470409 | \n", + "2.071769 | \n", + "earth-copy.pdf | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "1 | \n", + "|
5 | \n", + "earth.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "18d940f3-f4b4-46ac-9147-077675aead1d | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T14:19:27.492574 | \n", + "2.093768 | \n", + "earth.pdf | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "0 | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "doc_hash | \n", + "int_id_column | \n", + "removed | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "8dc8970e-215a-44fe-a7bf-946c03f36c60 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "2025-02-06T14:19:29.408910 | \n", + "1.912304 | \n", + "lorem-ipsum.pdf | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "3 | \n", + "[] | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "9ac78463-b325-406b-891e-c9e84722eb34 | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "2025-02-06T14:19:30.986464 | \n", + "1.573836 | \n", + "spam.pdf | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "5 | \n", + "[] | \n", + "|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "b3ed1942-54a6-49fc-bcbc-2d8c438adef3 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "2025-02-06T14:19:29.335271 | \n", + "1.850426 | \n", + "earth2.pdf | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "2 | \n", + "[] | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "6d882651-2506-41cb-8704-85575c64b143 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "2025-02-06T14:19:30.950673 | \n", + "1.612200 | \n", + "mars.pdf | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "4 | \n", + "[] | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "f8ccec16-576c-4e3e-8bec-359dff01d6d2 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T14:19:27.470409 | \n", + "2.071769 | \n", + "earth-copy.pdf | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "1 | \n", + "[] | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "doc_hash | \n", + "int_id_column | \n", + "removed | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "8dc8970e-215a-44fe-a7bf-946c03f36c60 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "2025-02-06T14:19:29.408910 | \n", + "1.912304 | \n", + "lorem-ipsum.pdf | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "3 | \n", + "[] | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "9ac78463-b325-406b-891e-c9e84722eb34 | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "2025-02-06T14:19:30.986464 | \n", + "1.573836 | \n", + "spam.pdf | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "5 | \n", + "[] | \n", + "|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "b3ed1942-54a6-49fc-bcbc-2d8c438adef3 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "2025-02-06T14:19:29.335271 | \n", + "1.850426 | \n", + "earth2.pdf | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "2 | \n", + "[] | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "6d882651-2506-41cb-8704-85575c64b143 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "2025-02-06T14:19:30.950673 | \n", + "1.612200 | \n", + "mars.pdf | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "4 | \n", + "[] | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "f8ccec16-576c-4e3e-8bec-359dff01d6d2 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "2025-02-06T14:19:27.470409 | \n", + "2.071769 | \n", + "earth-copy.pdf | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "1 | \n", + "[] | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "... | \n", + "docq_mean_word_len | \n", + "docq_symbol_to_word_ratio | \n", + "docq_sentence_count | \n", + "docq_lorem_ipsum_ratio | \n", + "docq_curly_bracket_ratio | \n", + "docq_contain_bad_word | \n", + "docq_bullet_point_ratio | \n", + "docq_ellipsis_line_ratio | \n", + "docq_alphabet_word_ratio | \n", + "docq_contain_common_en_words | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "lorem-ipsum.pdf | \n", + "Lorem ipsum Lorem ipsum Lorem ipsum | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "8dc8970e-215a-44fe-a7bf-946c03f36c60 | \n", + "6571294142213095721 | \n", + "bc012d063005cc02deb6c2592d1f8c3b273625edf9eec5... | \n", + "35 | \n", + "... | \n", + "5.000000 | \n", + "0.000000 | \n", + "1 | \n", + "0.085714 | \n", + "0.0 | \n", + "False | \n", + "0.000000 | \n", + "0.0 | \n", + "1.000000 | \n", + "False | \n", + "|
1 | \n", + "spam.pdf | \n", + "Free xxx | \n", + "1 | \n", + "0 | \n", + "2 | \n", + "9ac78463-b325-406b-891e-c9e84722eb34 | \n", + "10026122586747302274 | \n", + "543ffc97aef373ee009a5f908e0358ef80d329ca7ba964... | \n", + "8 | \n", + "... | \n", + "3.500000 | \n", + "0.000000 | \n", + "1 | \n", + "0.000000 | \n", + "0.0 | \n", + "True | \n", + "0.000000 | \n", + "0.0 | \n", + "1.000000 | \n", + "False | \n", + "|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "b3ed1942-54a6-49fc-bcbc-2d8c438adef3 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "... | \n", + "4.541284 | \n", + "0.027523 | \n", + "9 | \n", + "0.000000 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880734 | \n", + "True | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "6d882651-2506-41cb-8704-85575c64b143 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "... | \n", + "4.688000 | \n", + "0.032000 | \n", + "8 | \n", + "0.000000 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880000 | \n", + "True | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "f8ccec16-576c-4e3e-8bec-359dff01d6d2 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "... | \n", + "4.541284 | \n", + "0.027523 | \n", + "9 | \n", + "0.000000 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880734 | \n", + "True | \n", + "
5 rows × 27 columns
\n", + "\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "... | \n", + "docq_mean_word_len | \n", + "docq_symbol_to_word_ratio | \n", + "docq_sentence_count | \n", + "docq_lorem_ipsum_ratio | \n", + "docq_curly_bracket_ratio | \n", + "docq_contain_bad_word | \n", + "docq_bullet_point_ratio | \n", + "docq_ellipsis_line_ratio | \n", + "docq_alphabet_word_ratio | \n", + "docq_contain_common_en_words | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | \n", + "earth2.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "b3ed1942-54a6-49fc-bcbc-2d8c438adef3 | \n", + "10729312978404042321 | \n", + "f039191d59ce8ba25023a844f9b99e7ef2ea4bf75a23f4... | \n", + "610 | \n", + "... | \n", + "4.541284 | \n", + "0.027523 | \n", + "9 | \n", + "0.0 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880734 | \n", + "True | \n", + "|
3 | \n", + "mars.pdf | \n", + "## Mars\\n\\n## Solar System\\n\\nOur solar system... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "6d882651-2506-41cb-8704-85575c64b143 | \n", + "7758129997476962679 | \n", + "a3a4bb3b8f4f441d6d669e09f0cd07a9420d06850cf63e... | \n", + "717 | \n", + "... | \n", + "4.688000 | \n", + "0.032000 | \n", + "8 | \n", + "0.0 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880000 | \n", + "True | \n", + "|
4 | \n", + "earth-copy.pdf | \n", + "## Earth\\n\\n## Solar System\\n\\nOur solar syste... | \n", + "1 | \n", + "0 | \n", + "11 | \n", + "f8ccec16-576c-4e3e-8bec-359dff01d6d2 | \n", + "14711865278795535908 | \n", + "6140cf695f269a3ddca6568536076756105ad3186086b2... | \n", + "610 | \n", + "... | \n", + "4.541284 | \n", + "0.027523 | \n", + "9 | \n", + "0.0 | \n", + "0.0 | \n", + "False | \n", + "0.176471 | \n", + "0.0 | \n", + "0.880734 | \n", + "True | \n", + "
3 rows × 27 columns
\n", + "\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "attention.pdf | \n", + "Provided proper attribution is provided, Googl... | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "178f709f-cd23-4bad-957a-5e8a88c9af22 | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-17T18:15:44.573338 | \n", + "13.146994 | \n", + "attention.pdf | \n", + "|
1 | \n", + "granite2.pdf | \n", + "## Granite Code Models: A Family of Open Found... | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "758f58b8-eaba-444a-b348-d45194a1c2e6 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-17T18:16:40.700160 | \n", + "28.162055 | \n", + "granite2.pdf | \n", + "|
2 | \n", + "granite.pdf | \n", + "## Granite Code Models: A Family of Open Found... | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "c19d4b3e-c045-4823-8814-43da8808d68d | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-17T18:16:12.497813 | \n", + "27.883452 | \n", + "granite.pdf | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "removed | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", + "granite.pdf | \n", + "## Granite Code Models: A Family of Open Found... | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "c19d4b3e-c045-4823-8814-43da8808d68d | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-17T18:16:12.497813 | \n", + "27.883452 | \n", + "granite.pdf | \n", + "[] | \n", + "|
0 | \n", + "attention.pdf | \n", + "Provided proper attribution is provided, Googl... | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "178f709f-cd23-4bad-957a-5e8a88c9af22 | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-17T18:15:44.573338 | \n", + "13.146994 | \n", + "attention.pdf | \n", + "[] | \n", + "
\n", + " | filename | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "removed | \n", + "source_document_id | \n", + "contents | \n", + "document_id | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | \n", + "granite.pdf | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-17T18:16:12.497813 | \n", + "27.883452 | \n", + "granite.pdf | \n", + "[] | \n", + "c19d4b3e-c045-4823-8814-43da8808d68d | \n", + "## 6.1.5 RepoBench, CrossCodeEval: Repository-... | \n", + "63337c6952e14044ce448bb0dc6a369181b7779cffcd92... | \n", + "|
35 | \n", + "granite.pdf | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-17T18:16:12.497813 | \n", + "27.883452 | \n", + "granite.pdf | \n", + "[] | \n", + "c19d4b3e-c045-4823-8814-43da8808d68d | \n", + "## 3 Model Architecture\\n\\nWe train a series o... | \n", + "b0ad58f3ab8f7e69f2460a6713bf65396737cb179cc374... | \n", + "|
22 | \n", + "attention.pdf | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-17T18:15:44.573338 | \n", + "13.146994 | \n", + "attention.pdf | \n", + "[] | \n", + "178f709f-cd23-4bad-957a-5e8a88c9af22 | \n", + "## 6.2 Model Variations\\n\\nTo evaluate the imp... | \n", + "60de5803d0837ef01773367a79da7c3e47fe90bec09ecb... | \n", + "
\n", + " | filename | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "removed | \n", + "source_document_id | \n", + "contents | \n", + "document_id | \n", + "embeddings | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | \n", + "attention.pdf | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-17T18:15:44.573338 | \n", + "13.146994 | \n", + "attention.pdf | \n", + "[] | \n", + "178f709f-cd23-4bad-957a-5e8a88c9af22 | \n", + "## Abstract\\n\\nThe dominant sequence transduct... | \n", + "590629323f9d88598a80846d1df6a83d0ad6ac53efe278... | \n", + "[-0.08771476, -0.12373961, 0.043168165, 0.0060... | \n", + "|
18 | \n", + "attention.pdf | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-17T18:15:44.573338 | \n", + "13.146994 | \n", + "attention.pdf | \n", + "[] | \n", + "178f709f-cd23-4bad-957a-5e8a88c9af22 | \n", + "## 5.3 Optimizer\\n\\nWe used the Adam optimizer... | \n", + "47fc3dca18355f0f161c953a7ad213eaa8c33da0be6875... | \n", + "[-0.0124165565, -0.04576251, 0.037190527, -0.0... | \n", + "|
21 | \n", + "attention.pdf | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-17T18:15:44.573338 | \n", + "13.146994 | \n", + "attention.pdf | \n", + "[] | \n", + "178f709f-cd23-4bad-957a-5e8a88c9af22 | \n", + "## 6.1 Machine Translation\\n\\nOn the WMT 2014 ... | \n", + "b7aa340533889effd73d129e0c14083277031c44becfa6... | \n", + "[-0.037983608, -0.067570895, -0.000437462, 0.0... | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "attention.pdf | \n", + "Provided proper attribution is provided, Googl... | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "0677706d-c587-4ddc-a52d-7ed12b082cbe | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-19T22:48:56.992519 | \n", + "48.361864 | \n", + "attention.pdf | \n", + "|
1 | \n", + "granite2.pdf | \n", + "## Granite Code Models: A Family of Open Found... | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "ab9c2476-0e95-4b0e-84c9-1efab49761de | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-19T22:51:42.875845 | \n", + "165.833343 | \n", + "granite2.pdf | \n", + "|
2 | \n", + "granite.pdf | \n", + "## Granite Code Models: A Family of Open Found... | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "20953ad1-8227-4454-8b20-c412f5520185 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-19T22:50:57.694515 | \n", + "169.037999 | \n", + "granite.pdf | \n", + "
\n", + " | filename | \n", + "contents | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_id | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "removed | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "attention.pdf | \n", + "Provided proper attribution is provided, Googl... | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "0677706d-c587-4ddc-a52d-7ed12b082cbe | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-19T22:48:56.992519 | \n", + "48.361864 | \n", + "attention.pdf | \n", + "[] | \n", + "|
1 | \n", + "granite.pdf | \n", + "## Granite Code Models: A Family of Open Found... | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "20953ad1-8227-4454-8b20-c412f5520185 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-19T22:50:57.694515 | \n", + "169.037999 | \n", + "granite.pdf | \n", + "[] | \n", + "
\n", + " | filename | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "removed | \n", + "source_document_id | \n", + "contents | \n", + "document_id | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", + "attention.pdf | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-19T22:48:56.992519 | \n", + "48.361864 | \n", + "attention.pdf | \n", + "[] | \n", + "0677706d-c587-4ddc-a52d-7ed12b082cbe | \n", + "## Attention Is All You Need\\n\\nAshish Vaswani... | \n", + "45e678f43369d5fa127105b7cca6a6e4dd4deed6422185... | \n", + "|
58 | \n", + "granite.pdf | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-19T22:50:57.694515 | \n", + "169.037999 | \n", + "granite.pdf | \n", + "[] | \n", + "20953ad1-8227-4454-8b20-c412f5520185 | \n", + "## References\\n\\nWasi Uddin Ahmad, Md Golam Ra... | \n", + "b787f46ab644038e472b9815a122eead379ed7f37a3d4f... | \n", + "|
52 | \n", + "granite.pdf | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-19T22:50:57.694515 | \n", + "169.037999 | \n", + "granite.pdf | \n", + "[] | \n", + "20953ad1-8227-4454-8b20-c412f5520185 | \n", + "## 6.4 Code Reasoning, Understanding and Execu... | \n", + "1c7f5e76a2aaad73f5f03549b065016b0703239538839d... | \n", + "
\n", + " | filename | \n", + "num_pages | \n", + "num_tables | \n", + "num_doc_elements | \n", + "document_hash | \n", + "ext | \n", + "hash | \n", + "size | \n", + "date_acquired | \n", + "pdf_convert_time | \n", + "source_filename | \n", + "removed | \n", + "source_document_id | \n", + "contents | \n", + "document_id | \n", + "embeddings | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
44 | \n", + "granite.pdf | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-19T22:50:57.694515 | \n", + "169.037999 | \n", + "granite.pdf | \n", + "[] | \n", + "20953ad1-8227-4454-8b20-c412f5520185 | \n", + "## 6.1.1 HumanEvalSynthesize: Multilingual Cod... | \n", + "b10bcf46720fb7fff15818c4bc03ec37ae84181e6cbbc1... | \n", + "[-0.03851807, 0.00934296, 0.02425409, -0.00439... | \n", + "|
15 | \n", + "attention.pdf | \n", + "15 | \n", + "6 | \n", + "147 | \n", + "2949302674760005271 | \n", + "f1f600333e46c5d7e23f5a110a903ee38aab0bf7047eca... | \n", + "46040 | \n", + "2025-01-19T22:48:56.992519 | \n", + "48.361864 | \n", + "attention.pdf | \n", + "[] | \n", + "0677706d-c587-4ddc-a52d-7ed12b082cbe | \n", + "## 5 Training\\n\\nThis section describes the tr... | \n", + "7e7dce074e6995e9c9551e1349cad58153b319c45e20a1... | \n", + "[-0.02469791, -0.077463716, 0.07508141, 0.0363... | \n", + "|
40 | \n", + "granite.pdf | \n", + "28 | \n", + "19 | \n", + "295 | \n", + "3127757990743433032 | \n", + "0534b6a29ab9cedf21c3f6cf756cf0252d19a8e9135a41... | \n", + "127678 | \n", + "2025-01-19T22:50:57.694515 | \n", + "169.037999 | \n", + "granite.pdf | \n", + "[] | \n", + "20953ad1-8227-4454-8b20-c412f5520185 | \n", + "## 4.4 Infrastructure\\n\\nWe train the Granite ... | \n", + "81f51d21fd61607d8aa9bb50925c2bc936fa1da7b27f4b... | \n", + "[-0.033672214, -0.01862875, 0.0034308454, 0.06... | \n", + "
\n", - " | filename | \n", - "contents | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "document_id | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "granite.pdf | \n", - "{\"_name\":\"\",\"type\":\"pdf-document\",\"description... | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 | \n", - "0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... | \n", - "654989 | \n", - "2024-10-02T00:24:48.959612 | \n", - "34.223920 | \n", - "granite.pdf | \n", - "|
1 | \n", - "attension.pdf | \n", - "{\"_name\":\"\",\"type\":\"pdf-document\",\"description... | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "f275d75a-a072-4836-8a55-6a65f0d34577 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:24:14.713654 | \n", - "18.004455 | \n", - "attension.pdf | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
87 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... | \n", - "654989 | \n", - "2024-10-02T00:24:48.959612 | \n", - "34.223920 | \n", - "granite.pdf | \n", - "4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 | \n", - "6.3 Code Editing and Translation\\nTable 12: Pa... | \n", - "$.main-text[189] | \n", - "16 | \n", - "[106.69820404, 190.24554443, 504.00320435, 211... | \n", - "f28d8c9a4fe81f0baf801daf9a95ddaf152a4ac5e8b8ac... | \n", - "|
154 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:24:14.713654 | \n", - "18.004455 | \n", - "attension.pdf | \n", - "f275d75a-a072-4836-8a55-6a65f0d34577 | \n", - "3.2.2 Multi-Head Attention\\nMulti-head attenti... | \n", - "$.main-text[55] | \n", - "5 | \n", - "[107.46644592, 669.41210938, 503.99703979, 690... | \n", - "da79f02a5f19c2f07de7a6f1da9df8db00f01a477582ac... | \n", - "|
67 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... | \n", - "654989 | \n", - "2024-10-02T00:24:48.959612 | \n", - "34.223920 | \n", - "granite.pdf | \n", - "4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 | \n", - "6.1.5 RepoBench, CrossCodeEval: Repository-Lev... | \n", - "$.main-text[153] | \n", - "12 | \n", - "[106.97065735, 224.31654358, 505.74191284, 290... | \n", - "cd5bd4537bde007298a91de7fa2fb4b56516d2f1d31262... | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
192 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:24:14.713654 | \n", - "18.004455 | \n", - "attension.pdf | \n", - "f275d75a-a072-4836-8a55-6a65f0d34577 | \n", - "6.2 Model Variations\\nIn Table 3 rows (A), we ... | \n", - "$.main-text[118] | \n", - "9 | \n", - "[107.27760315, 318.93438721, 505.24127197, 350... | \n", - "70948f748c6f275b39c70652e29d60dfd53c545e0d6d92... | \n", - "70948f748c6f275b39c70652e29d60dfd53c545e0d6d92... | \n", - "69 | \n", - "|
71 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... | \n", - "654989 | \n", - "2024-10-02T00:24:48.959612 | \n", - "34.223920 | \n", - "granite.pdf | \n", - "4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 | \n", - "6.1.5 RepoBench, CrossCodeEval: Repository-Lev... | \n", - "$.tables[7] | \n", - "13 | \n", - "[109.39778137, 486.89639282, 502.1010437, 679.... | \n", - "b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52... | \n", - "b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52... | \n", - "159 | \n", - "|
196 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:24:14.713654 | \n", - "18.004455 | \n", - "attension.pdf | \n", - "f275d75a-a072-4836-8a55-6a65f0d34577 | \n", - "6.3 English Constituency Parsing\\nWe performed... | \n", - "$.main-text[123] | \n", - "9 | \n", - "[106.96768951, 69.592453, 504.24859619, 101.62... | \n", - "93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5... | \n", - "93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5... | \n", - "73 | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "removed | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
194 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:24:14.713654 | \n", - "18.004455 | \n", - "attension.pdf | \n", - "f275d75a-a072-4836-8a55-6a65f0d34577 | \n", - "6.3 English Constituency Parsing\\nTo evaluate ... | \n", - "$.main-text[121] | \n", - "9 | \n", - "[107.15766144, 167.93530273, 504.10968018, 210... | \n", - "10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f... | \n", - "10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f... | \n", - "71 | \n", - "[] | \n", - "|
101 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... | \n", - "654989 | \n", - "2024-10-02T00:24:48.959612 | \n", - "34.223920 | \n", - "granite.pdf | \n", - "4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 | \n", - "6.5 Math Reasoning\\nTable 15: Performance on 4... | \n", - "$.main-text[219] | \n", - "19 | \n", - "[118.49487305, 699.65753174, 492.17700195, 710... | \n", - "c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852... | \n", - "c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852... | \n", - "189 | \n", - "[] | \n", - "|
206 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:24:14.713654 | \n", - "18.004455 | \n", - "attension.pdf | \n", - "f275d75a-a072-4836-8a55-6a65f0d34577 | \n", - "7 Conclusion\\nAcknowledgements We are grateful... | \n", - "$.main-text[135] | \n", - "10 | \n", - "[107.4437561, 212.26509094, 504.00241089, 232.... | \n", - "855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4... | \n", - "855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4... | \n", - "83 | \n", - "[] | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "removed | \n", - "embeddings | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
193 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:24:14.713654 | \n", - "18.004455 | \n", - "attension.pdf | \n", - "f275d75a-a072-4836-8a55-6a65f0d34577 | \n", - "6.2 Model Variations\\nIn Table 3 rows (B), we ... | \n", - "$.main-text[119] | \n", - "9 | \n", - "[107.44257355, 248.49208069, 505.24127197, 312... | \n", - "6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522... | \n", - "6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522... | \n", - "70 | \n", - "[] | \n", - "[-0.0049973284, -0.10789071, 0.02143236, -0.02... | \n", - "|
210 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:24:14.713654 | \n", - "18.004455 | \n", - "attension.pdf | \n", - "f275d75a-a072-4836-8a55-6a65f0d34577 | \n", - "Attention Visualizations Input-Input Layer5\\nF... | \n", - "$.main-text[190] | \n", - "15 | \n", - "[107.43354034, 157.36341858, 504.06988525, 189... | \n", - "67626adb815bf2b27871df24d538ddc10ae68a3fbbd238... | \n", - "67626adb815bf2b27871df24d538ddc10ae68a3fbbd238... | \n", - "87 | \n", - "[] | \n", - "[0.01508544, -0.015680796, 0.039181348, 0.0084... | \n", - "|
46 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... | \n", - "654989 | \n", - "2024-10-02T00:24:48.959612 | \n", - "34.223920 | \n", - "granite.pdf | \n", - "4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 | \n", - "6.1.1 HumanEvalSynthesize: Multilingual Code G... | \n", - "$.main-text[117] | \n", - "9 | \n", - "[107.46860504, 613.84277344, 456.97003174, 624... | \n", - "3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba... | \n", - "3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba... | \n", - "134 | \n", - "[] | \n", - "[-0.029933447, 0.031515192, -0.04598905, -0.01... | \n", - "
\n", - " | filename | \n", - "contents | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "document_id | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "granite.pdf | \n", - "{\"_name\":\"\",\"type\":\"pdf-document\",\"description... | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "|
1 | \n", - "attension.pdf | \n", - "{\"_name\":\"\",\"type\":\"pdf-document\",\"description... | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "7afd3fbc-3a9f-4728-8fd8-4a9a13980244 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:26:29.888597 | \n", - "53.822026 | \n", - "attension.pdf | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
185 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:26:29.888597 | \n", - "53.822026 | \n", - "attension.pdf | \n", - "7afd3fbc-3a9f-4728-8fd8-4a9a13980244 | \n", - "6.1 Machine Translation\\nOn the WMT 2014 Engli... | \n", - "$.main-text[108] | \n", - "8 | \n", - "[107.27262115, 260.13467407, 505.24533081, 302... | \n", - "d6c1d3686219a176bc5ff0ebf4f5c82a53d95d1502d476... | \n", - "|
94 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "6.3 Code Editing and Translation\\nFrom Table 1... | \n", - "$.main-text[199] | \n", - "17 | \n", - "[107.33219147, 356.5696106, 505.74539185, 411.... | \n", - "1c841522286ea1348acafd3a4cfbbffd327ca5de53c5f9... | \n", - "|
175 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:26:29.888597 | \n", - "53.822026 | \n", - "attension.pdf | \n", - "7afd3fbc-3a9f-4728-8fd8-4a9a13980244 | \n", - "5.1 Training Data and Batching\\nWe trained on ... | \n", - "$.main-text[91] | \n", - "7 | \n", - "[107.12083435, 343.05245972, 505.65435791, 418... | \n", - "77de84b7743b8360a371146c12c9795a12984ef82354f4... | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
31 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "3 Model Architecture\\nremove final 8 layers fr... | \n", - "$.main-text[69] | \n", - "6 | \n", - "[107.45430756, 456.21582031, 504.50476074, 521... | \n", - "72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9... | \n", - "72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9... | \n", - "119 | \n", - "|
116 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "Acknowledgments\\nThanks and acknowledgement to... | \n", - "$.main-text[249] | \n", - "21 | \n", - "[107.07092285, 59.12960052, 505.24591064, 160.... | \n", - "b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84... | \n", - "b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84... | \n", - "204 | \n", - "|
95 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "6.3 Code Editing and Translation\\nCodeLingua (... | \n", - "$.main-text[200] | \n", - "17 | \n", - "[107.03813934, 207.6650238, 505.74505615, 350.... | \n", - "c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3... | \n", - "c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3... | \n", - "183 | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_hash | \n", - "chunk_id | \n", - "removed | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
188 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:26:29.888597 | \n", - "53.822026 | \n", - "attension.pdf | \n", - "7afd3fbc-3a9f-4728-8fd8-4a9a13980244 | \n", - "6.2 Model Variations\\nTo evaluate the importan... | \n", - "$.main-text[112] | \n", - "8 | \n", - "[107.1419754, 91.9256134, 504.05615234, 113.59... | \n", - "6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f... | \n", - "6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f... | \n", - "65 | \n", - "[] | \n", - "|
153 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:26:29.888597 | \n", - "53.822026 | \n", - "attension.pdf | \n", - "7afd3fbc-3a9f-4728-8fd8-4a9a13980244 | \n", - "3.2.2 Multi-Head Attention\\noutput values. The... | \n", - "$.main-text[54] | \n", - "5 | \n", - "[107.36427307, 696.97607422, 503.99719238, 717... | \n", - "07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b... | \n", - "07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b... | \n", - "30 | \n", - "[] | \n", - "|
68 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "6.1.5 RepoBench, CrossCodeEval: Repository-Lev... | \n", - "$.main-text[154] | \n", - "12 | \n", - "[107.21151733, 141.59487915, 505.73928833, 218... | \n", - "650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd... | \n", - "650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd... | \n", - "156 | \n", - "[] | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_id | \n", - "removed | \n", - "chunk_hash | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
47 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "6.1.1 HumanEvalSynthesize: Multilingual Code G... | \n", - "$.main-text[118] | \n", - "9 | \n", - "[107.09940338, 505.84005737, 505.70474243, 604... | \n", - "22dd65548755f19ec6ccd89020fd1fbc88e339fafbd881... | \n", - "135 | \n", - "[] | \n", - "-1 | \n", - "|
134 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:26:29.888597 | \n", - "53.822026 | \n", - "attension.pdf | \n", - "7afd3fbc-3a9f-4728-8fd8-4a9a13980244 | \n", - "1 Introduction\\nAttention mechanisms have beco... | \n", - "$.main-text[20] | \n", - "2 | \n", - "[107.17721558, 497.6980896, 505.65536499, 540.... | \n", - "362722af4a10ed54ca21fd329149c01397a621e15f8306... | \n", - "11 | \n", - "[] | \n", - "-1 | \n", - "|
93 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "6.3 Code Editing and Translation\\nTarget Langu... | \n", - "$.tables[13] | \n", - "17 | \n", - "[161.45388794, 433.6942749, 450.61630249, 552.... | \n", - "f665c10385f0eb31b2b94e5e61c934651f5789f5ab528c... | \n", - "181 | \n", - "[] | \n", - "-1 | \n", - "
\n", - " | filename | \n", - "num_pages | \n", - "num_tables | \n", - "num_doc_elements | \n", - "ext | \n", - "hash | \n", - "size | \n", - "date_acquired | \n", - "pdf_convert_time | \n", - "source_filename | \n", - "source_document_id | \n", - "contents | \n", - "doc_jsonpath | \n", - "page_number | \n", - "bbox | \n", - "document_id | \n", - "chunk_id | \n", - "removed | \n", - "chunk_hash | \n", - "embeddings | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
171 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:26:29.888597 | \n", - "53.822026 | \n", - "attension.pdf | \n", - "7afd3fbc-3a9f-4728-8fd8-4a9a13980244 | \n", - "4 Why Self-Attention\\nlength n is smaller than... | \n", - "$.main-text[85] | \n", - "7 | \n", - "[107.26034546, 652.83349609, 504.29177856, 717... | \n", - "6f8efa86e0a4f77b0d72d4a3141e5e0611b2921a392b99... | \n", - "48 | \n", - "[] | \n", - "-1 | \n", - "[0.018015103, -0.038851, 0.0016827772, -0.0493... | \n", - "|
25 | \n", - "granite.pdf | \n", - "28 | \n", - "17 | \n", - "348 | \n", - "79c53d694df467391e94f279af2fa6a9a7e45c3922546e... | \n", - "655054 | \n", - "2024-10-02T00:28:23.836369 | \n", - "167.768806 | \n", - "granite.pdf | \n", - "81bc331a-69cf-49bd-84b9-afedcab1344a | \n", - "3 Model Architecture\\nBatch size, 3B = 2048. B... | \n", - "$.tables[0] | \n", - "5 | \n", - "[138.25450134, 299.99499512, 471.55078125, 432... | \n", - "b8f3a83c697e885ad31913c716644399a4772691e39d0b... | \n", - "113 | \n", - "[] | \n", - "-1 | \n", - "[0.003977602, -0.06122852, -0.089708336, -0.00... | \n", - "|
137 | \n", - "attension.pdf | \n", - "15 | \n", - "4 | \n", - "193 | \n", - "6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... | \n", - "135814 | \n", - "2024-10-02T00:26:29.888597 | \n", - "53.822026 | \n", - "attension.pdf | \n", - "7afd3fbc-3a9f-4728-8fd8-4a9a13980244 | \n", - "2 Background\\nSelf-attention, sometimes called... | \n", - "$.main-text[24] | \n", - "2 | \n", - "[107.29702759, 256.18237305, 505.24960327, 298... | \n", - "9c2abd2ec38b67c74873e0cd670d27b702711d05930f26... | \n", - "14 | \n", - "[] | \n", - "-1 | \n", - "[0.03394238, -0.0117239505, -0.03349689, -0.02... | \n", - "
cma:readability$ make venv PYTHON=python3.11 cma:readability$ source venv/bin/activate -(venv) cma:readability$ python -m dpk_readability.runtime --data_local_config "{ 'input_folder': 'test-data/input', 'output_folder': 'output' }" -12:07:23 INFO - Launching Readability transform -12:07:23 INFO - Readability parameters are : {'readability_contents_column_name': 'contents', 'readability_curriculum': False} -12:07:23 INFO - pipeline id pipeline_id -12:07:23 INFO - code location None -12:07:23 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output -12:07:23 INFO - data factory data_ max_files -1, n_sample -1 -12:07:23 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet'] -12:07:23 INFO - orchestrator readability started at 2025-01-28 12:07:23 -12:07:23 INFO - Number of files is 1, source profile {'max_file_size': 0.014194488525390625, 'min_file_size': 0.014194488525390625, 'total_file_size': 0.014194488525390625} -12:07:23 INFO - Completed 1 files (100.0%) in 0.002 min -12:07:23 INFO - Done processing 1 files, waiting for flush() completion. 
-12:07:23 INFO - done flushing in 0.0 sec -12:07:23 INFO - Completed execution in 0.003 min, execution result 0 +(venv) cma:readability$ python -m dpk_readability.runtime --data_local_config "{ 'input_folder': 'test-data/input', 'output_folder': 'output' }" --readability_score_list "['reading_time_textstat','spache_readability_textstat','text_standard_textstat']" +13:07:23 INFO - Launching Readability transform +13:07:23 INFO - Readability parameters are : {'readability_contents_column_name': 'contents', 'readability_score_list': ['reading_time_textstat', 'spache_readability_textstat', 'text_standard_textstat']} +13:07:23 INFO - pipeline id pipeline_id +13:07:23 INFO - code location None +13:07:23 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output +13:07:23 INFO - data factory data_ max_files -1, n_sample -1 +13:07:23 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet'] +13:07:23 INFO - orchestrator readability started at 2025-02-07 13:07:23 +13:07:23 INFO - Number of files is 1, source profile {'max_file_size': 0.014194488525390625, 'min_file_size': 0.014194488525390625, 'total_file_size': 0.014194488525390625} +13:07:24 INFO - Completed 1 files (100.0%) in 0.002 min +13:07:24 INFO - Done processing 1 files, waiting for flush() completion. 
+13:07:24 INFO - done flushing in 0.0 sec +13:07:24 INFO - Completed execution in 0.002 min, execution result 0 (venv) cma:readability$ deactivate@@ -134,8 +140,8 @@ options: -h, --help show this help message and exit --readability_contents_column_name READABILITY_CONTENTS_COLUMN_NAME contents column name for input parquet table to transform - --readability_curriculum READABILITY_CURRICULUM - curriculum parameter for transform; select True for curriculum learning + --readability_score_list READABILITY_SCORE_LIST + list of readability scores to be computed by the transform; valid values: {'flesch_ease_textstat', 'reading_time_textstat', 'flesch_kincaid_textstat', 'automated_readability_index_textstat', 'linsear_write_formula_textstat', 'text_standard_textstat', 'smog_index_textstat', 'difficult_words_textstat', 'spache_readability_textstat', 'dale_chall_readability_score_textstat', 'mcalpine_eflaw_textstat', 'gunning_fog_textstat', 'coleman_liau_index_textstat'} --data_s3_cred DATA_S3_CRED AST string of options for s3 credentials. Only required for S3 data access. 
access_key: access key help text @@ -181,3 +187,4 @@ options: path: Path within the repository Example: { 'github': 'https://github.com/somerepo', 'commit_hash': '1324', 'path': 'transforms/universal/code' } + diff --git a/transforms/language/readability/dpk_readability/common.py b/transforms/language/readability/dpk_readability/common.py index 352e11b1f8..d19e6c9dfa 100644 --- a/transforms/language/readability/dpk_readability/common.py +++ b/transforms/language/readability/dpk_readability/common.py @@ -63,12 +63,10 @@ """Key holds the mcalpine_eflaw_textstat R score threshold parameter""" reading_time_textstat = "reading_time_textstat" """Key holds the reading_time_textstat R score threshold parameter""" -avg_grade_level = "avg_grade_level" -"""Key holds the avg_grade_level R score threshold parameter""" contents_column_name = "contents_column_name" """Contents column name for the input parquet table to the transform""" -curriculum = "curriculum" -"""curriculum parameter for transform; either True or False""" +score_list = "score_list" +"""list of readability scores to be computed by the transform""" ######################################################################################## @@ -76,12 +74,12 @@ """avg_grade_level R score threshold parameter""" contents_column_name_cli_param = f"{cli_prefix}{contents_column_name}" """Content column name for parquet input table to transform""" -curriculum_cli_param = f"{cli_prefix}{curriculum}" -"""curriculum parameter for transform; either True or False""" +score_list_cli_param = f"{cli_prefix}{score_list}" +"""list of readability scores or a single readability scores to be computed by the transform""" # The set of default value that can be overwritten from the CLI """ contents_column_name_default = "contents" """The default value for contents_column_name""" -curriculum_default = False -"""curriculum parameter for transform; either True or False""" +score_list_default = mcalpine_eflaw_textstat +"""readability score 
that is computed by default""" diff --git a/transforms/language/readability/dpk_readability/runtime.py b/transforms/language/readability/dpk_readability/runtime.py index 62d03c6906..018c27daaa 100644 --- a/transforms/language/readability/dpk_readability/runtime.py +++ b/transforms/language/readability/dpk_readability/runtime.py @@ -10,6 +10,8 @@ # limitations under the License. ################################################################################ +import argparse +import ast import sys from argparse import ArgumentParser, Namespace @@ -21,12 +23,25 @@ from data_processing.transform import TransformConfiguration from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger, str2bool from dpk_readability.common import ( + automated_readability_index_textstat, cli_prefix, + coleman_liau_index_textstat, contents_column_name_cli_param, contents_column_name_default, - curriculum_cli_param, - curriculum_default, + dale_chall_readability_score_textstat, + difficult_words_textstat, + flesch_ease_textstat, + flesch_kincaid_textstat, + gunning_fog_textstat, + linsear_write_formula_textstat, + mcalpine_eflaw_textstat, + reading_time_textstat, + score_list_cli_param, + score_list_default, short_name, + smog_index_textstat, + spache_readability_textstat, + text_standard_textstat, ) from dpk_readability.transform import ReadabilityTransform @@ -54,6 +69,33 @@ def add_input_params(self, parser: ArgumentParser) -> None: By convention a common prefix should be used for all transform-specific CLI args (e.g, noop_, pii_, etc.) 
""" + valid_values = { + flesch_ease_textstat, + flesch_kincaid_textstat, + gunning_fog_textstat, + smog_index_textstat, + coleman_liau_index_textstat, + automated_readability_index_textstat, + dale_chall_readability_score_textstat, + difficult_words_textstat, + linsear_write_formula_textstat, + text_standard_textstat, + spache_readability_textstat, + mcalpine_eflaw_textstat, + reading_time_textstat, + } + + def validate_scores(x): + if x.startswith("[") and x.endswith("]"): + scores = ast.literal_eval(x) + if not all(score in valid_values for score in scores): + raise argparse.ArgumentTypeError(f"Invalid scores in list. Allowed scores: {valid_values}") + return scores + elif x in valid_values: + return x + else: + raise argparse.ArgumentTypeError(f"Invalid score: {x}. Allowed scores: {valid_values}") + parser.add_argument( f"--{contents_column_name_cli_param}", type=str, @@ -61,12 +103,13 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=contents_column_name_default, help="contents column name for input parquet table to transform", ) + parser.add_argument( - f"--{curriculum_cli_param}", - type=lambda x: bool(str2bool(x)), + f"--{score_list_cli_param}", + type=validate_scores, required=False, - default=curriculum_default, - help="curriculum parameter for transform; select True for curriculum learning", + default=score_list_default, + help=f"list of readability scores to be computed by the transform; valid values: {valid_values}", ) def apply_input_params(self, args: Namespace) -> bool: diff --git a/transforms/language/readability/dpk_readability/transform.py b/transforms/language/readability/dpk_readability/transform.py index 73d97b72cb..dcab7c306c 100644 --- a/transforms/language/readability/dpk_readability/transform.py +++ b/transforms/language/readability/dpk_readability/transform.py @@ -10,20 +10,18 @@ # limitations under the License. 
################################################################################ -from typing import Any +from typing import Any, Callable +import polars as pl import pyarrow as pa import textstat from data_processing.transform import AbstractTableTransform from data_processing.utils import get_logger from dpk_readability.common import ( automated_readability_index_textstat, - avg_grade_level, coleman_liau_index_textstat, contents_column_name_cli_param, contents_column_name_default, - curriculum_cli_param, - curriculum_default, dale_chall_readability_score_textstat, difficult_words_textstat, flesch_ease_textstat, @@ -32,6 +30,8 @@ linsear_write_formula_textstat, mcalpine_eflaw_textstat, reading_time_textstat, + score_list_cli_param, + score_list_default, smog_index_textstat, spache_readability_textstat, text_standard_textstat, @@ -49,134 +49,123 @@ class ReadabilityTransform(AbstractTableTransform): def __init__(self, config: dict): super().__init__(config) self.contents_column_name = config.get(contents_column_name_cli_param, contents_column_name_default) - self.curriculum = config.get(curriculum_cli_param, curriculum_default) + self.score_list = config.get(score_list_cli_param, score_list_default) + if isinstance(self.score_list, str): + self.score_list = [self.score_list] def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """transform function for readability_scores""" - pq_df_new = table.to_pandas() - - if self.curriculum: - ######## This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. - pq_df_new[flesch_kincaid_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.flesch_kincaid_grade(x) - ) - - ######## This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. 
- pq_df_new[gunning_fog_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.gunning_fog(x) - ) - - ######## Returns the ARI (Automated Readability Index) which outputs a number that approximates the grade level needed to comprehend the text. For example if the ARI is 6.5, then the grade level to comprehend the text is 6th to 7th grade. - pq_df_new[automated_readability_index_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.automated_readability_index(x) - ) - - ######## Average of all grade_level metrics - # pq_df_new['avg_grade_level'] = pq_df_new[['flesch_kincaid_textstat', 'gunning_fog_textstat', 'coleman_liau_index_textstat', 'automated_readability_index_textstat', 'dale_chall_readability_score_textstat', 'linsear_write_formula_textstat']].mean(axis=1) - ######## R83_avg_GradeL - pq_df_new[avg_grade_level] = pq_df_new[ - [flesch_kincaid_textstat, gunning_fog_textstat, automated_readability_index_textstat] - ].mean(axis=1) - - ######## Returns a score for the readability of an english text for a foreign learner or English, focusing on the number of miniwords and length of sentences. It is recommended to aim for a score equal to or lower than 25. Further reading on blog https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/ - pq_df_new[mcalpine_eflaw_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.mcalpine_eflaw(x) - ) - else: - ######### textstat Readability Scores - ######### Score School level (US) Notes - ######### 100.00–90.00 5th grade Very easy to read. Easily understood by an average 11-year-old student. - ######### 90.0–80.0 6th grade Easy to read. Conversational English for consumers. - ######### 80.0–70.0 7th grade Fairly easy to read. - ######### 70.0–60.0 8th & 9th grade Plain English. Easily understood by 13- to 15-year-old students. - ######### 60.0–50.0 10th to 12th grade Fairly difficult to read. 
- ######### 50.0–30.0 College Difficult to read. - ######### 30.0–10.0 College graduate Very difficult to read. Best understood by university graduates. - ######### 10.0–0.0 Professional Extremely difficult to read. Best understood by university graduates. - ######## While the maximum score is 121.22, there is no limit on how low the score can be. A negative score is valid. - pq_df_new[flesch_ease_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.flesch_reading_ease(x) - ) - - ######## This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. - pq_df_new[flesch_kincaid_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.flesch_kincaid_grade(x) - ) - - ######## This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. - pq_df_new[gunning_fog_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.gunning_fog(x) - ) - - ######## Returns the SMOG index of the given text. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. Texts of fewer than 30 sentences are statistically invalid, because the SMOG formula was normed on 30-sentence samples. textstat requires at least 3 sentences for a result. - pq_df_new[smog_index_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.smog_index(x) - ) - - ######## Returns the grade level of the text using the Coleman-Liau Formula. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. - pq_df_new[coleman_liau_index_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.coleman_liau_index(x) - ) - - ######## Returns the ARI (Automated Readability Index) which outputs a number that approximates the grade level needed to comprehend the text. 
For example if the ARI is 6.5, then the grade level to comprehend the text is 6th to 7th grade. - pq_df_new[automated_readability_index_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.automated_readability_index(x) - ) - - ######## Different from other tests, since it uses a lookup table of the most commonly used 3000 English words. Thus it returns the grade level using the New Dale-Chall Formula. Further reading on https://en.wikipedia.org/wiki/Dale–Chall_readability_formula - ######### Score Understood by - ######### 4.9 or lower average 4th-grade student or lower - ######### 5.0–5.9 average 5th or 6th-grade student - ######### 6.0–6.9 average 7th or 8th-grade student - ######### 7.0–7.9 average 9th or 10th-grade student - ######### 8.0–8.9 average 11th or 12th-grade student - ######### 9.0–9.9 average 13th to 15th-grade (college) student - pq_df_new[dale_chall_readability_score_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.dale_chall_readability_score(x) - ) - - ######## No explanation - pq_df_new[difficult_words_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.difficult_words(x) - ) - - ######## Returns the grade level using the Linsear Write Formula. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. Further reading on Wikipedia https://en.wikipedia.org/wiki/Linsear_Write - pq_df_new[linsear_write_formula_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.linsear_write_formula(x) - ) - - ######## Based upon all the above tests, returns the estimated school grade level required to understand the text. Optional float_output allows the score to be returned as a float. Defaults to False. - pq_df_new[text_standard_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.text_standard(x, float_output=True) - ) - - ######## Returns grade level of english text. 
Intended for text written for children up to grade four. - ######## Further reading on https://en.wikipedia.org/wiki/Spache_readability_formula - pq_df_new[spache_readability_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.spache_readability(x) - ) - - ######## Returns a score for the readability of an english text for a foreign learner or English, focusing on the number of miniwords and length of sentences. It is recommended to aim for a score equal to or lower than 25. Further reading on blog https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/ - pq_df_new[mcalpine_eflaw_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.mcalpine_eflaw(x) - ) - - ######## Returns the reading time of the given text. Assumes 14.69ms per character. - ######## Further reading in Thttps://homepages.inf.ed.ac.uk/keller/papers/cognition08a.pdf - pq_df_new[reading_time_textstat] = pq_df_new[self.contents_column_name].apply( - lambda x: textstat.reading_time(x) - ) - - ######## Average of all grade_level metrics - # pq_df_new['avg_grade_level'] = pq_df_new[['flesch_kincaid_textstat', 'gunning_fog_textstat', 'coleman_liau_index_textstat', 'automated_readability_index_textstat', 'dale_chall_readability_score_textstat', 'linsear_write_formula_textstat']].mean(axis=1) - ######## R83_avg_GradeL - pq_df_new[avg_grade_level] = pq_df_new[ - [flesch_kincaid_textstat, gunning_fog_textstat, automated_readability_index_textstat] - ].mean(axis=1) - - output_table = pa.Table.from_pandas(pq_df_new) + df = pl.from_arrow(table) + + ######### textstat Readability Scores + ######### Score School level (US) Notes + ######### 100.00–90.00 5th grade Very easy to read. Easily understood by an average 11-year-old student. + ######### 90.0–80.0 6th grade Easy to read. Conversational English for consumers. + ######### 80.0–70.0 7th grade Fairly easy to read. + ######### 70.0–60.0 8th & 9th grade Plain English. 
Easily understood by 13- to 15-year-old students. + ######### 60.0–50.0 10th to 12th grade Fairly difficult to read. + ######### 50.0–30.0 College Difficult to read. + ######### 30.0–10.0 College graduate Very difficult to read. Best understood by university graduates. + ######### 10.0–0.0 Professional Extremely difficult to read. Best understood by university graduates. + ######## While the maximum score is 121.22, there is no limit on how low the score can be. A negative score is valid. + + df = self._add_textstat_column( + df, self.contents_column_name, textstat.flesch_reading_ease, flesch_ease_textstat + ) + + ######## This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. + df = self._add_textstat_column( + df, self.contents_column_name, textstat.flesch_kincaid_grade, flesch_kincaid_textstat + ) + + ######## This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. + df = self._add_textstat_column(df, self.contents_column_name, textstat.gunning_fog, gunning_fog_textstat) + + ######## Returns the SMOG index of the given text. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. Texts of fewer than 30 sentences are statistically invalid, because the SMOG formula was normed on 30-sentence samples. textstat requires at least 3 sentences for a result. + df = self._add_textstat_column(df, self.contents_column_name, textstat.smog_index, smog_index_textstat) + + ######## Returns the grade level of the text using the Coleman-Liau Formula. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. 
+ df = self._add_textstat_column( + df, self.contents_column_name, textstat.coleman_liau_index, coleman_liau_index_textstat + ) + + ######## Returns the ARI (Automated Readability Index) which outputs a number that approximates the grade level needed to comprehend the text. For example if the ARI is 6.5, then the grade level to comprehend the text is 6th to 7th grade. + df = self._add_textstat_column( + df, self.contents_column_name, textstat.automated_readability_index, automated_readability_index_textstat + ) + + ######## Different from other tests, since it uses a lookup table of the most commonly used 3000 English words. Thus it returns the grade level using the New Dale-Chall Formula. Further reading on https://en.wikipedia.org/wiki/Dale–Chall_readability_formula + ######### Score Understood by + ######### 4.9 or lower average 4th-grade student or lower + ######### 5.0–5.9 average 5th or 6th-grade student + ######### 6.0–6.9 average 7th or 8th-grade student + ######### 7.0–7.9 average 9th or 10th-grade student + ######### 8.0–8.9 average 11th or 12th-grade student + ######### 9.0–9.9 average 13th to 15th-grade (college) student + df = self._add_textstat_column( + df, self.contents_column_name, textstat.dale_chall_readability_score, dale_chall_readability_score_textstat + ) + + ######## No explanation + df = self._add_textstat_column( + df, self.contents_column_name, textstat.difficult_words, difficult_words_textstat + ) + + ######## Returns the grade level using the Linsear Write Formula. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document. Further reading on Wikipedia https://en.wikipedia.org/wiki/Linsear_Write + df = self._add_textstat_column( + df, self.contents_column_name, textstat.linsear_write_formula, linsear_write_formula_textstat + ) + + ######## Based upon all the above tests, returns the estimated school grade level required to understand the text. 
Optional float_output allows the score to be returned as a float. Defaults to False. + df = self._add_textstat_column( + df, self.contents_column_name, textstat.text_standard, text_standard_textstat, float_output=True + ) + + ######## Returns grade level of english text. Intended for text written for children up to grade four. + ######## Further reading on https://en.wikipedia.org/wiki/Spache_readability_formula + df = self._add_textstat_column( + df, self.contents_column_name, textstat.spache_readability, spache_readability_textstat + ) + + ######## Returns a score for the readability of an english text for a foreign learner of English, focusing on the number of miniwords and length of sentences. It is recommended to aim for a score equal to or lower than 25. Further reading on blog https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/ + df = self._add_textstat_column(df, self.contents_column_name, textstat.mcalpine_eflaw, mcalpine_eflaw_textstat) + + ######## Returns the reading time of the given text. Assumes 14.69ms per character. + ######## Further reading in https://homepages.inf.ed.ac.uk/keller/papers/cognition08a.pdf + df = self._add_textstat_column(df, self.contents_column_name, textstat.reading_time, reading_time_textstat) + + # output_table = pa.Table.from_pandas(pq_df_new) + output_table = df.to_arrow() metadata = {"nrows": len(output_table)} logger.debug(f"Transformed one table with {len(output_table)} rows") return [output_table], metadata + + def _add_textstat_column( + self, + df: pl.DataFrame, + text_column: str, + stat_func: Callable, + new_column_name: str, + **kwargs: Any, + ) -> pl.DataFrame: + """ + Adds a new column to the Polars DataFrame by applying a textstat function to a text column.
+ The textstat function is applied only if the score named by new_column_name is listed + in self.score_list; otherwise the input DataFrame is returned unchanged. + + :param df: The input Polars DataFrame + :param text_column: The name of the text column + :param stat_func: A textstat function to apply element-wise (via map_elements) to the text column + :param new_column_name: The name of the new column; also the score name looked up in self.score_list + :param kwargs: Extra keyword arguments forwarded to stat_func on every call + (e.g. float_output=True for textstat.text_standard) + :return: A new DataFrame with the additional computed column (requested as pl.Float64), + or the input DataFrame itself when the score is not in self.score_list + """ + if new_column_name in self.score_list: + return df.with_columns( + df[text_column] + .map_elements(lambda x: stat_func(x, **kwargs), return_dtype=pl.Float64) + .alias(new_column_name) + ) + else: + return df diff --git a/transforms/language/readability/readability_python.ipynb b/transforms/language/readability/readability_python.ipynb index 51c73569e8..58a7d76e79 100644 --- a/transforms/language/readability/readability_python.ipynb +++ b/transforms/language/readability/readability_python.ipynb @@ -56,7 +56,7 @@ "| input_folder:str | \\${PWD}/test-data/input/ | folder that contains the input parquet files for the extreme tokenized algorithm |\n", "| output_folder:str | \\${PWD}/output/ | folder that contains the all the intermediate results and the output parquet files for the extreme tokenized algorithm |\n", "| readability_contents_column_name:str | text | name of the column that stores document text |\n", - "| readability_curriculum:str | False | curriculum parameter for transform; either True or False |" + "| readability_score_list:Union[str, list[str]] | mcalpine_eflaw_textstat | list of readability scores or a single readability scores to be computed by the transform |" ] }, { @@ -69,18 +69,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "11:49:27 INFO - Readability parameters are : {'contents_column_name': 'contents', 'curriculum': False}\n", - "11:49:27 INFO - pipeline id pipeline_id\n", - "11:49:27 INFO - code location None\n", - "11:49:27 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", - "11:49:27
INFO - data factory data_ max_files -1, n_sample -1\n", - "11:49:27 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "11:49:27 INFO - orchestrator readability started at 2025-01-23 11:49:27\n", - "11:49:27 INFO - Number of files is 1, source profile {'max_file_size': 0.014194488525390625, 'min_file_size': 0.014194488525390625, 'total_file_size': 0.014194488525390625}\n", - "11:49:27 INFO - Completed 1 files (100.0%) in 0.003 min\n", - "11:49:27 INFO - Done processing 1 files, waiting for flush() completion.\n", - "11:49:27 INFO - done flushing in 0.0 sec\n", - "11:49:27 INFO - Completed execution in 0.003 min, execution result 0\n" + "19:29:24 INFO - Readability parameters are : {'readability_contents_column_name': 'contents', 'readability_score_list': ['mcalpine_eflaw_textstat']}\n", + "19:29:24 INFO - pipeline id pipeline_id\n", + "19:29:24 INFO - code location None\n", + "19:29:24 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "19:29:24 INFO - data factory data_ max_files -1, n_sample -1\n", + "19:29:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "19:29:24 INFO - orchestrator readability started at 2025-02-10 19:29:24\n", + "19:29:24 INFO - Number of files is 1, source profile {'max_file_size': 0.014194488525390625, 'min_file_size': 0.014194488525390625, 'total_file_size': 0.014194488525390625}\n", + "19:29:25 INFO - Completed 1 files (100.0%) in 0.006 min\n", + "19:29:25 INFO - Done processing 1 files, waiting for flush() completion.\n", + "19:29:25 INFO - done flushing in 0.0 sec\n", + "19:29:25 INFO - Completed execution in 0.006 min, execution result 0\n" ] }, { @@ -99,7 +99,7 @@ " input_folder=\"test-data/input\",\n", " output_folder=\"output\",\n", " 
readability_contents_column_name=\"contents\",\n", - " readability_curriculum=False,\n", + " readability_score_list=[\"mcalpine_eflaw_textstat\"],\n", ").transform()\n" ] }, @@ -432,3089 +432,334 @@ "name": "stdout", "output_type": "stream", "text": [ - "shape: (2, 16)\n", - "┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬────────┐\n", - "│ con ┆ id ┆ fle ┆ fle ┆ gun ┆ smo ┆ col ┆ aut ┆ dal ┆ dif ┆ lin ┆ tex ┆ spa ┆ mca ┆ rea ┆ avg_gr │\n", - "│ ten ┆ --- ┆ sch ┆ sch ┆ nin ┆ g_i ┆ ema ┆ oma ┆ e_c ┆ fic ┆ sea ┆ t_s ┆ che ┆ lpi ┆ din ┆ ade_le │\n", - "│ ts ┆ str ┆ _ea ┆ _ki ┆ g_f ┆ nde ┆ n_l ┆ ted ┆ hal ┆ ult ┆ r_w ┆ tan ┆ _re ┆ ne_ ┆ g_t ┆ vel │\n", - "│ --- ┆ ┆ se_ ┆ nca ┆ og_ ┆ x_t ┆ iau ┆ _re ┆ l_r ┆ _wo ┆ rit ┆ dar ┆ ada ┆ efl ┆ ime ┆ --- │\n", - "│ str ┆ ┆ tex ┆ id_ ┆ tex ┆ ext ┆ _in ┆ ada ┆ ead ┆ rds ┆ e_f ┆ d_t ┆ bil ┆ aw_ ┆ _te ┆ f64 │\n", - "│ ┆ ┆ tst ┆ tex ┆ tst ┆ sta ┆ dex ┆ bil ┆ abi ┆ _te ┆ orm ┆ ext ┆ ity ┆ tex ┆ xts ┆ │\n", - "│ ┆ ┆ at ┆ tst ┆ at ┆ t ┆ _te ┆ ity ┆ lit ┆ xts ┆ ula ┆ sta ┆ _te ┆ tst ┆ tat ┆ │\n", - "│ ┆ ┆ --- ┆ at ┆ --- ┆ --- ┆ xts ┆ _in ┆ y_s ┆ tat ┆ _te ┆ t ┆ xts ┆ at ┆ --- ┆ │\n", - "│ ┆ ┆ f64 ┆ --- ┆ f64 ┆ f64 ┆ tat ┆ dex ┆ cor ┆ --- ┆ xts ┆ --- ┆ tat ┆ --- ┆ f64 ┆ │\n", - "│ ┆ ┆ ┆ f64 ┆ ┆ ┆ --- ┆ _te ┆ e_t ┆ i64 ┆ tat ┆ f64 ┆ --- ┆ f64 ┆ ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ xts ┆ ext ┆ ┆ --- ┆ ┆ f64 ┆ ┆ ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ tat ┆ sta ┆ ┆ f64 ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ --- ┆ t ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ --- ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "╞═════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪════════╡\n", - "│ Six ┆
\n", + " | text | \n", + "id | \n", + "dump | \n", + "url | \n", + "date | \n", + "file_path | \n", + "language | \n", + "language_score | \n", + "token_count | \n", + "
---|---|---|---|---|---|---|---|---|---|
0 | \n", + "How AP reported in all formats from tornado-st... | \n", + "<urn:uuid:d66bc6fe-8477-4adf-b430-f6a558ccc8ff> | \n", + "CC-MAIN-2013-20 | \n", + "http://%20jwashington@ap.org/Content/Press-Rel... | \n", + "2013-05-18T05:48:54Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.972142 | \n", + "717 | \n", + "
1 | \n", + "Did you know you have two little yellow, nine-... | \n", + "<urn:uuid:803e14c3-dc2e-43d6-b75d-6fb3981c4fe6> | \n", + "CC-MAIN-2013-20 | \n", + "http://1000awesomethings.com/2012/09/24/934-ad... | \n", + "2013-05-18T08:11:45Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.947991 | \n", + "821 | \n", + "
2 | \n", + "Car Wash For Clara!\\nNow is your chance to hel... | \n", + "<urn:uuid:ac1bbfff-9519-4967-9c64-3dc3a4b471ec> | \n", + "CC-MAIN-2013-20 | \n", + "http://1027kord.com/car-wash-for-clara/ | \n", + "2013-05-18T06:49:55Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.911518 | \n", + "125 | \n", + "
3 | \n", + "Listeners Get Sky-high View of Missoula From H... | \n", + "<urn:uuid:c1445c58-b111-4c4e-badd-1e43ec317df7> | \n", + "CC-MAIN-2013-20 | \n", + "http://1075zoofm.com/listeners-get-sky-high-vi... | \n", + "2013-05-18T06:25:20Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.956516 | \n", + "103 | \n", + "
4 | \n", + "Log In Please enter your ECode to log in.\\nFor... | \n", + "<urn:uuid:e5829f7d-b944-4468-9573-61b7cb3078cc> | \n", + "CC-MAIN-2013-20 | \n", + "http://1105govinfoevents.com/enterprisearchite... | \n", + "2013-05-18T05:27:01Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.798235 | \n", + "75 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
1091391 | \n", + "PALMS — The winner of a $7 million SuperLotto ... | \n", + "<urn:uuid:9a5989f7-b385-498f-84de-75abc9272805> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.scpr.org/news/2010/06/06/15880/7m-s... | \n", + "2013-05-22T08:33:55Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.971524 | \n", + "165 | \n", + "
1091392 | \n", + "Irfan Khan/AFP/Getty Images\\nFormer Bell City ... | \n", + "<urn:uuid:b49419dd-bc94-4302-a097-6c544fa0631e> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.scpr.org/news/2011/03/15/24996/atto... | \n", + "2013-05-22T07:56:02Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.973813 | \n", + "313 | \n", + "
1091393 | \n", + "A more common sentiment than you would think (... | \n", + "<urn:uuid:832b678a-df73-4131-b479-b9fbd3370a6f> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.scq.ubc.ca/sciencescouts/the-i%E2%8... | \n", + "2013-05-22T07:55:36Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.969990 | \n", + "217 | \n", + "
1091394 | \n", + "Paper Fashions Boutique is here to save you ti... | \n", + "<urn:uuid:1c61271c-9694-4481-aef2-117fea466605> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.scrapscene.com/2010/08/new-scrapboo... | \n", + "2013-05-22T08:27:53Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.963822 | \n", + "659 | \n", + "
1091395 | \n", + "Admissions down in Argentina by 7% in first ha... | \n", + "<urn:uuid:8759fd30-1bf9-4538-83d1-1195e0d08f93> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.screendaily.com/admissions-down-in-... | \n", + "2013-05-22T08:13:50Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.925611 | \n", + "252 | \n", + "
1091396 rows × 9 columns
\n", + "\n", + " | text | \n", + "id | \n", + "dump | \n", + "url | \n", + "date | \n", + "file_path | \n", + "language | \n", + "language_score | \n", + "token_count | \n", + "
---|---|---|---|---|---|---|---|---|---|
0 | \n", + "How AP reported in all formats from tornado-st... | \n", + "<urn:uuid:d66bc6fe-8477-4adf-b430-f6a558ccc8ff> | \n", + "CC-MAIN-2013-20 | \n", + "http://%20jwashington@ap.org/Content/Press-Rel... | \n", + "2013-05-18T05:48:54Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.972142 | \n", + "717 | \n", + "
1 | \n", + "Did you know you have two little yellow, nine-... | \n", + "<urn:uuid:803e14c3-dc2e-43d6-b75d-6fb3981c4fe6> | \n", + "CC-MAIN-2013-20 | \n", + "http://1000awesomethings.com/2012/09/24/934-ad... | \n", + "2013-05-18T08:11:45Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.947991 | \n", + "821 | \n", + "
2 | \n", + "Car Wash For Clara!\\nNow is your chance to hel... | \n", + "<urn:uuid:ac1bbfff-9519-4967-9c64-3dc3a4b471ec> | \n", + "CC-MAIN-2013-20 | \n", + "http://1027kord.com/car-wash-for-clara/ | \n", + "2013-05-18T06:49:55Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.911518 | \n", + "125 | \n", + "
3 | \n", + "Listeners Get Sky-high View of Missoula From H... | \n", + "<urn:uuid:c1445c58-b111-4c4e-badd-1e43ec317df7> | \n", + "CC-MAIN-2013-20 | \n", + "http://1075zoofm.com/listeners-get-sky-high-vi... | \n", + "2013-05-18T06:25:20Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.956516 | \n", + "103 | \n", + "
4 | \n", + "Log In Please enter your ECode to log in.\\nFor... | \n", + "<urn:uuid:e5829f7d-b944-4468-9573-61b7cb3078cc> | \n", + "CC-MAIN-2013-20 | \n", + "http://1105govinfoevents.com/enterprisearchite... | \n", + "2013-05-18T05:27:01Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.798235 | \n", + "75 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
1091391 | \n", + "PALMS — The winner of a $7 million SuperLotto ... | \n", + "<urn:uuid:9a5989f7-b385-498f-84de-75abc9272805> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.scpr.org/news/2010/06/06/15880/7m-s... | \n", + "2013-05-22T08:33:55Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.971524 | \n", + "165 | \n", + "
1091392 | \n", + "Irfan Khan/AFP/Getty Images\\nFormer Bell City ... | \n", + "<urn:uuid:b49419dd-bc94-4302-a097-6c544fa0631e> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.scpr.org/news/2011/03/15/24996/atto... | \n", + "2013-05-22T07:56:02Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.973813 | \n", + "313 | \n", + "
1091393 | \n", + "A more common sentiment than you would think (... | \n", + "<urn:uuid:832b678a-df73-4131-b479-b9fbd3370a6f> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.scq.ubc.ca/sciencescouts/the-i%E2%8... | \n", + "2013-05-22T07:55:36Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.969990 | \n", + "217 | \n", + "
1091394 | \n", + "Paper Fashions Boutique is here to save you ti... | \n", + "<urn:uuid:1c61271c-9694-4481-aef2-117fea466605> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.scrapscene.com/2010/08/new-scrapboo... | \n", + "2013-05-22T08:27:53Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.963822 | \n", + "659 | \n", + "
1091395 | \n", + "Admissions down in Argentina by 7% in first ha... | \n", + "<urn:uuid:8759fd30-1bf9-4538-83d1-1195e0d08f93> | \n", + "CC-MAIN-2013-20 | \n", + "http://www.screendaily.com/admissions-down-in-... | \n", + "2013-05-22T08:13:50Z | \n", + "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/se... | \n", + "en | \n", + "0.925611 | \n", + "252 | \n", + "
1091396 rows × 9 columns
\n", + "