Enable vertical text detection for rotated images (#4328)

vladimir-kivi-ds · ryannikolaidis · web-flow · commit dfb1653f7f0c · 2026-04-14T00:55:02.000Z
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: vladimir-kivi-ds <vladimir-kivi-ds@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/example-docs/rotated-page-90.pdf b/example-docs/rotated-page-90.pdf
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,7 +49,7 @@ dependencies = [
 [project.optional-dependencies]
 # Document type extras
 csv = [
-    "pandas>=2.0.0, <4.0.0",
+    "pandas>=2.0.0, <3.0.0",
 ]
 doc = [
     "unstructured[docx]",
@@ -68,9 +68,9 @@ image = [
     "pi-heif>=1.2.0, <2.0.0",
     "pikepdf>=10.3.0, <11.0.0",
     "pypdf>=6.6.2, <7.0.0",
-    "unstructured-inference>=1.6.2, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
+    "unstructured-inference>=1.6.6, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
     "unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows' and python_version < '3.12'",
-    "unstructured-inference>=1.6.2, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
+    "unstructured-inference>=1.6.6, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
     "unstructured-pytesseract>=0.3.15, <1.0.0",
 ]
 md = [
@@ -109,7 +109,7 @@ xlsx = [
     "msoffcrypto-tool>=6.0.0, <7.0.0",
     "networkx>=3.2.0, <4.0.0",
     "openpyxl>=3.1.5, <4.0.0",
-    "pandas>=2.0.0, <4.0.0",
+    "pandas>=2.0.0, <3.0.0",
     "xlrd>=2.0.1, <3.0.0",
 ]
 # Speech-to-text for partition_audio (multimodal: audio -> elements)
@@ -195,14 +195,9 @@ required-environments = [
     "sys_platform == 'darwin' and platform_machine == 'arm64'",
     "sys_platform == 'win32'",
 ]
-override-dependencies = [
-    # unstructured-inference 1.6.2 has unnecessarily aggressive numpy/pandas floors
-    # that conflict with kdbai-client (via pykx). The inference codebase only uses
-    # basic APIs available since numpy 1.26 / pandas 1.5.
-    "numpy>=1.26.0",
-    "pandas>=1.5.0",
-]
 constraint-dependencies = [
+    # Temporary pin for Azure public-container ingest regression in adlfs 2026.4.0 stack
+    "adlfs==2026.2.0",
     # deltalake 1.3.0 is missing Linux ARM64 wheels, causing Docker ARM64 builds to fail
     "deltalake<1.3.0",
     "fonttools>=4.60.2",
diff --git a/scripts/check-licenses.sh b/scripts/check-licenses.sh
@@ -30,13 +30,14 @@ Python-2.0"
 # upstream source repository.
 IGNORED_PACKAGES=(
   # Metadata missing -- verified permissive on GitHub
-  arro3-core     # MIT / Apache-2.0 (geoarrow/geoarrow-rs)
-  chroma-hnswlib # Apache-2.0 (chroma-core/hnswlib)
-  google-crc32c  # Apache-2.0 (googleapis/python-crc32c)
-  iopath         # MIT (facebookresearch/iopath)
-  pypdfium2      # BSD-3-Clause (PDFium/PDFium)
-  sentencepiece  # Apache-2.0 (google/sentencepiece)
-  voyageai       # MIT (voyage-ai/voyageai-python)
+  arro3-core        # MIT / Apache-2.0 (geoarrow/geoarrow-rs)
+  chroma-hnswlib    # Apache-2.0 (chroma-core/hnswlib)
+  google-crc32c     # Apache-2.0 (googleapis/python-crc32c)
+  iopath            # MIT (facebookresearch/iopath)
+  pypdfium2         # BSD-3-Clause (PDFium/PDFium)
+  sentencepiece     # Apache-2.0 (google/sentencepiece)
+  voyageai          # MIT (voyage-ai/voyageai-python)
+  matplotlib-inline # BSD 3-Clause (ipython/matplotlib-inline)
 
   # Permissive but non-standard classifier
   lmdb # OpenLDAP Public License (BSD-style, jnwatson/py-lmdb)
diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py
@@ -69,6 +69,7 @@ class MockPageLayout(layout.PageLayout):
     def __init__(self, number: int, image: Image):
         self.number = number
         self.image = image
+        self.image_metadata = {"pdf_rotation": 0}
         self.elements = [
             layout.LayoutElement.from_coords(
                 type="Title",
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1824,3 +1824,20 @@ def test_reproductible_pdf_loader():
                 assert e1.text == e2.text, f"load two time {f=} return differents results"
             else:
                 break
+
+
+def test_hi_res_groups_rotated_page_text_into_words():
+    elements = pdf.partition_pdf(
+        filename=example_doc_path("rotated-page-90.pdf"),
+        strategy=PartitionStrategy.HI_RES,
+    )
+
+    texts = [e.text for e in elements if e.text and len(e.text) > 5]
+    assert any("Hello World" in t for t in texts), (
+        f"Expected 'Hello World' as grouped text from rotated page, got: {texts[:5]}"
+    )
+
+    single_chars = [e.text for e in elements if e.text and len(e.text) == 1]
+    assert len(single_chars) == 0, (
+        f"Rotated page produced {len(single_chars)} single-char elements: {single_chars[:10]}"
+    )
diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html
@@ -43,8 +43,8 @@
   <h1 class="Title" id="d3be9e3d661e2a79f37257caa5b54d8c">
    LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis
   </h1>
-  <p class="NarrativeText" id="4dfee7e352ae892814e46bb220094b0f">
-   Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li&gt;
+  <p class="NarrativeText" id="607ee712429ac9cf3540dbdc5e55e143">
+   Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®
   </p>
   <p class="NarrativeText" id="23b8def20ce16f929d4f558b2a19f200">
    1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca
@@ -561,8 +561,8 @@ <h1 class="Title" id="54ee49eac3f4e6098811cda1f9dd0306">
   <li class="ListItem" id="184a3abfd34e7aa04632979ee3c2de36">
    17 This measures the number of edits from the ground-truth text to the predicted text, and lower is better.
   </li>
-  <li class="ListItem" id="2b7101f39954d5301166b82906202ea9">
-   LayoutParser: A Uniﬁed Toolkit for DL-Based DIA
+  <li class="ListItem" id="f1b03448874d9c98a0a59a20b134c513">
+   LayoutParser: A Uniﬁed Toolkit for DL-Based DIA 13
   </li>
   <img alt="ra (a) Partial table at the bottom (b) Full page table (c) Partial table at the top (d) Mis-detected text line" class="Image" id="d7ab3da5ec0adb1b2b4fb5f800a545a0"/>
   <p class="FigureCaption" id="d35d253341e8b8d837f384ecd6ac410a">
diff --git a/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.md b/test_unstructured_ingest/expected-structured-output-markdown/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.md
@@ -10,7 +10,7 @@ X
 r
 a
 # LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis
-Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li>
+Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®
 1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca
 Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model conﬁgurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going eﬀorts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.
 Keywords: Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit.
@@ -120,7 +120,7 @@ Additionally, it is common for historical documents to use unique fonts with di
 Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate ﬁne-grained results that enable creative approaches like page reorganization for OCR.
 16 This measures the overlap between the detected and ground-truth characters, and the maximum is 1.
 17 This measures the number of edits from the ground-truth text to the predicted text, and lower is better.
-LayoutParser: A Uniﬁed Toolkit for DL-Based DIA
+LayoutParser: A Uniﬁed Toolkit for DL-Based DIA 13
 ra (a) Partial table at the bottom (b) Full page table (c) Partial table at the top (d) Mis-detected text line
 Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in diﬀerent locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.
 5.2 A light-weight Visual Table Extractor
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
@@ -267,8 +267,8 @@
   },
   {
     "type": "NarrativeText",
-    "element_id": "4dfee7e352ae892814e46bb220094b0f",
-    "text": "Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li>",
+    "element_id": "607ee712429ac9cf3540dbdc5e55e143",
+    "text": "Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®",
     "metadata": {
       "is_extracted": "true",
       "filetype": "application/pdf",
@@ -3292,8 +3292,8 @@
   },
   {
     "type": "ListItem",
-    "element_id": "2b7101f39954d5301166b82906202ea9",
-    "text": "LayoutParser: A Uniﬁed Toolkit for DL-Based DIA",
+    "element_id": "f1b03448874d9c98a0a59a20b134c513",
+    "text": "LayoutParser: A Uniﬁed Toolkit for DL-Based DIA 13",
     "metadata": {
       "is_extracted": "true",
       "filetype": "application/pdf",
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.19"  # pragma: no cover
+__version__ = "0.22.20"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -732,6 +732,18 @@ def is_pdf_too_complex(
     return False
 
 
+def _enable_detect_vertical_if_rotated(
+    inferred_document_layout,
+    pdfminer_config: Optional["PDFMinerConfig"],
+) -> Optional["PDFMinerConfig"]:
+    """Enable detect_vertical in pdfminer when the PDF has rotated pages."""
+    if any((p.image_metadata or {}).get("pdf_rotation", 0) for p in inferred_document_layout.pages):
+        pdfminer_config = pdfminer_config or PDFMinerConfig()
+        pdfminer_config.detect_vertical = True
+
+    return pdfminer_config
+
+
 @requires_dependencies("unstructured_inference")
 def _partition_pdf_or_image_local(
     filename: str = "",
@@ -815,6 +827,11 @@ def _partition_pdf_or_image_local(
             password=password,
         )
 
+        pdfminer_config = _enable_detect_vertical_if_rotated(
+            inferred_document_layout,
+            pdfminer_config,
+        )
+
         extracted_layout, layouts_links = (
             process_file_with_pdfminer(
                 filename=filename,
@@ -877,6 +894,11 @@ def _partition_pdf_or_image_local(
         if hasattr(file, "seek"):
             file.seek(0)
 
+        pdfminer_config = _enable_detect_vertical_if_rotated(
+            inferred_document_layout,
+            pdfminer_config,
+        )
+
         extracted_layout, layouts_links = (
             process_data_with_pdfminer(
                 file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -303,6 +303,7 @@ class PDFMinerConfig(BaseModel):
     word_margin: Optional[float] = None
     line_margin: Optional[float] = None
     char_margin: Optional[float] = None
+    detect_vertical: Optional[bool] = None
 
 
 def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = None):
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.22.19" # pragma: no cover`
	`1`	`+__version__ = "0.22.20" # pragma: no cover`