Skip to content

Commit dfb1653

Browse files
Enable vertical text detection for rotated images (#4328)
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: vladimir-kivi-ds <vladimir-kivi-ds@users.noreply.github.com>
1 parent d0aa8eb commit dfb1653

13 files changed

Lines changed: 3179 additions & 2936 deletions

File tree

CHANGELOG.md

Lines changed: 1187 additions & 1072 deletions
Large diffs are not rendered by default.

example-docs/rotated-page-90.pdf

1.08 KB
Binary file not shown.

pyproject.toml

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ dependencies = [
4949
[project.optional-dependencies]
5050
# Document type extras
5151
csv = [
52-
"pandas>=2.0.0, <4.0.0",
52+
"pandas>=2.0.0, <3.0.0",
5353
]
5454
doc = [
5555
"unstructured[docx]",
@@ -68,9 +68,9 @@ image = [
6868
"pi-heif>=1.2.0, <2.0.0",
6969
"pikepdf>=10.3.0, <11.0.0",
7070
"pypdf>=6.6.2, <7.0.0",
71-
"unstructured-inference>=1.6.2, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
71+
"unstructured-inference>=1.6.6, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
7272
"unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows' and python_version < '3.12'",
73-
"unstructured-inference>=1.6.2, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
73+
"unstructured-inference>=1.6.6, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
7474
"unstructured-pytesseract>=0.3.15, <1.0.0",
7575
]
7676
md = [
@@ -109,7 +109,7 @@ xlsx = [
109109
"msoffcrypto-tool>=6.0.0, <7.0.0",
110110
"networkx>=3.2.0, <4.0.0",
111111
"openpyxl>=3.1.5, <4.0.0",
112-
"pandas>=2.0.0, <4.0.0",
112+
"pandas>=2.0.0, <3.0.0",
113113
"xlrd>=2.0.1, <3.0.0",
114114
]
115115
# Speech-to-text for partition_audio (multimodal: audio -> elements)
@@ -195,14 +195,9 @@ required-environments = [
195195
"sys_platform == 'darwin' and platform_machine == 'arm64'",
196196
"sys_platform == 'win32'",
197197
]
198-
override-dependencies = [
199-
# unstructured-inference 1.6.2 has unnecessarily aggressive numpy/pandas floors
200-
# that conflict with kdbai-client (via pykx). The inference codebase only uses
201-
# basic APIs available since numpy 1.26 / pandas 1.5.
202-
"numpy>=1.26.0",
203-
"pandas>=1.5.0",
204-
]
205198
constraint-dependencies = [
199+
# Temporary pin for Azure public-container ingest regression in adlfs 2026.4.0 stack
200+
"adlfs==2026.2.0",
206201
# deltalake 1.3.0 is missing Linux ARM64 wheels, causing Docker ARM64 builds to fail
207202
"deltalake<1.3.0",
208203
"fonttools>=4.60.2",

scripts/check-licenses.sh

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,14 @@ Python-2.0"
3030
# upstream source repository.
3131
IGNORED_PACKAGES=(
3232
# Metadata missing -- verified permissive on GitHub
33-
arro3-core # MIT / Apache-2.0 (geoarrow/geoarrow-rs)
34-
chroma-hnswlib # Apache-2.0 (chroma-core/hnswlib)
35-
google-crc32c # Apache-2.0 (googleapis/python-crc32c)
36-
iopath # MIT (facebookresearch/iopath)
37-
pypdfium2 # BSD-3-Clause (PDFium/PDFium)
38-
sentencepiece # Apache-2.0 (google/sentencepiece)
39-
voyageai # MIT (voyage-ai/voyageai-python)
33+
arro3-core # MIT / Apache-2.0 (geoarrow/geoarrow-rs)
34+
chroma-hnswlib # Apache-2.0 (chroma-core/hnswlib)
35+
google-crc32c # Apache-2.0 (googleapis/python-crc32c)
36+
iopath # MIT (facebookresearch/iopath)
37+
pypdfium2 # BSD-3-Clause (PDFium/PDFium)
38+
sentencepiece # Apache-2.0 (google/sentencepiece)
39+
voyageai # MIT (voyage-ai/voyageai-python)
40+
matplotlib-inline # BSD-3-Clause (ipython/matplotlib-inline)
4041

4142
# Permissive but non-standard classifier
4243
lmdb # OpenLDAP Public License (BSD-style, jnwatson/py-lmdb)

test_unstructured/partition/pdf_image/test_image.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class MockPageLayout(layout.PageLayout):
6969
def __init__(self, number: int, image: Image):
7070
self.number = number
7171
self.image = image
72+
self.image_metadata = {"pdf_rotation": 0}
7273
self.elements = [
7374
layout.LayoutElement.from_coords(
7475
type="Title",

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1824,3 +1824,20 @@ def test_reproductible_pdf_loader():
18241824
assert e1.text == e2.text, f"load two time {f=} return differents results"
18251825
else:
18261826
break
1827+
1828+
1829+
def test_hi_res_groups_rotated_page_text_into_words():
1830+
elements = pdf.partition_pdf(
1831+
filename=example_doc_path("rotated-page-90.pdf"),
1832+
strategy=PartitionStrategy.HI_RES,
1833+
)
1834+
1835+
texts = [e.text for e in elements if e.text and len(e.text) > 5]
1836+
assert any("Hello World" in t for t in texts), (
1837+
f"Expected 'Hello World' as grouped text from rotated page, got: {texts[:5]}"
1838+
)
1839+
1840+
single_chars = [e.text for e in elements if e.text and len(e.text) == 1]
1841+
assert len(single_chars) == 0, (
1842+
f"Rotated page produced {len(single_chars)} single-char elements: {single_chars[:10]}"
1843+
)

test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@
4343
<h1 class="Title" id="d3be9e3d661e2a79f37257caa5b54d8c">
4444
LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis
4545
</h1>
46-
<p class="NarrativeText" id="4dfee7e352ae892814e46bb220094b0f">
47-
Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li&gt;
46+
<p class="NarrativeText" id="607ee712429ac9cf3540dbdc5e55e143">
47+
Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®
4848
</p>
4949
<p class="NarrativeText" id="23b8def20ce16f929d4f558b2a19f200">
5050
1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca
@@ -561,8 +561,8 @@ <h1 class="Title" id="54ee49eac3f4e6098811cda1f9dd0306">
561561
<li class="ListItem" id="184a3abfd34e7aa04632979ee3c2de36">
562562
17 This measures the number of edits from the ground-truth text to the predicted text, and lower is better.
563563
</li>
564-
<li class="ListItem" id="2b7101f39954d5301166b82906202ea9">
565-
LayoutParser: A Unified Toolkit for DL-Based DIA
564+
<li class="ListItem" id="f1b03448874d9c98a0a59a20b134c513">
565+
LayoutParser: A Unified Toolkit for DL-Based DIA 13
566566
</li>
567567
<img alt="ra (a) Partial table at the bottom (b) Full page table (c) Partial table at the top (d) Mis-detected text line" class="Image" id="d7ab3da5ec0adb1b2b4fb5f800a545a0"/>
568568
<p class="FigureCaption" id="d35d253341e8b8d837f384ecd6ac410a">

test_unstructured_ingest/expected-structured-output-markdown/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ X
1010
r
1111
a
1212
# LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis
13-
Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li>
13+
Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®
1414
1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca
1515
Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.
1616
Keywords: Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit.
@@ -120,7 +120,7 @@ Additionally, it is common for historical documents to use unique fonts with di
120120
Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR.
121121
16 This measures the overlap between the detected and ground-truth characters, and the maximum is 1.
122122
17 This measures the number of edits from the ground-truth text to the predicted text, and lower is better.
123-
LayoutParser: A Unified Toolkit for DL-Based DIA
123+
LayoutParser: A Unified Toolkit for DL-Based DIA 13
124124
ra (a) Partial table at the bottom (b) Full page table (c) Partial table at the top (d) Mis-detected text line
125125
Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in different locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.
126126
5.2 A light-weight Visual Table Extractor

test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -267,8 +267,8 @@
267267
},
268268
{
269269
"type": "NarrativeText",
270-
"element_id": "4dfee7e352ae892814e46bb220094b0f",
271-
"text": "Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li>",
270+
"element_id": "607ee712429ac9cf3540dbdc5e55e143",
271+
"text": "Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®",
272272
"metadata": {
273273
"is_extracted": "true",
274274
"filetype": "application/pdf",
@@ -3292,8 +3292,8 @@
32923292
},
32933293
{
32943294
"type": "ListItem",
3295-
"element_id": "2b7101f39954d5301166b82906202ea9",
3296-
"text": "LayoutParser: A Unified Toolkit for DL-Based DIA",
3295+
"element_id": "f1b03448874d9c98a0a59a20b134c513",
3296+
"text": "LayoutParser: A Unified Toolkit for DL-Based DIA 13",
32973297
"metadata": {
32983298
"is_extracted": "true",
32993299
"filetype": "application/pdf",

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.22.19" # pragma: no cover
1+
__version__ = "0.22.20" # pragma: no cover

0 commit comments

Comments
 (0)