Merge pull request #1029 from jsvine/develop

Merge v0.10.3 into stable
jsvine · Oct 26, 2023 · d9561d1 · d9561d1
2 parents ceef47b + 2e838d1
commit d9561d1
Show file tree

Hide file tree

Showing 16 changed files with 184 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,17 @@
 
 All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
+## [0.10.3] - 2023-10-26
+
+### Added
+
+- Add support for marked-content sequences, represented by `mcid` and `tag` attributes on `char`/`rect`/`line`/`curve`/`image` objects (h/t @dhdaines). ([#961](https://github.com/jsvine/pdfplumber/pull/961))
+- Add `gs_path` argument to `pdfplumber.open(...)` and `pdfplumber.repair(...)`, to allow passing a custom Ghostscript path to be used for repairing. ([#953](https://github.com/jsvine/pdfplumber/issues/953))
+
+### Fixed
+
+- Respect `use_text_flow` in `extract_text` (h/t @dhdaines). ([#983](https://github.com/jsvine/pdfplumber/pull/983))
+
 ## [0.10.2] - 2023-07-29
 
 ### Added

diff --git a/CITATION.cff b/CITATION.cff
@@ -1,8 +1,8 @@
 cff-version: 1.2.0
 title: pdfplumber
 type: software
-version: 0.10.2
-date-released: "2023-07-29"
+version: 0.10.3
+date-released: "2023-10-26"
 authors:
   - family-names: "Singer-Vine"
     given-names: "Jeremy"

diff --git a/README.md b/README.md
@@ -158,6 +158,8 @@ Each object is represented as a simple Python `dict`, with the following propert
 |`bottom`| Distance of bottom of the character from top of page.|
 |`doctop`| Distance of top of character from top of document.|
 |`matrix`| The "current transformation matrix" for this character. (See below for details.)|
+|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this character if any (otherwise `None`). *Experimental attribute.*|
+|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this character if any (otherwise `None`). *Experimental attribute.*|
 |`ncs`|TKTK|
 |`stroking_pattern`|TKTK|
 |`non_stroking_pattern`|TKTK|
@@ -191,6 +193,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`linewidth`| Thickness of line.|
 |`stroking_color`|The color of the line. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The non-stroking color specified for the line’s path. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this line if any (otherwise `None`). *Experimental attribute.*|
+|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this line if any (otherwise `None`). *Experimental attribute.*|
 |`object_type`| "line"|
 
 #### `rect` properties
@@ -210,6 +214,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`linewidth`| Thickness of line.|
 |`stroking_color`|The color of the rectangle's outline. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The rectangle’s fill color. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this rect if any (otherwise `None`). *Experimental attribute.*|
+|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this rect if any (otherwise `None`). *Experimental attribute.*|
 |`object_type`| "rect"|
 
 #### `curve` properties
@@ -231,6 +237,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`fill`| Whether the shape defined by the curve's path is filled.|
 |`stroking_color`|The color of the curve's outline. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The curve’s fill color. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this curve if any (otherwise `None`). *Experimental attribute.*|
+|`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this curve if any (otherwise `None`). *Experimental attribute.*|
 |`object_type`| "curve"|
 
 #### Derived properties
@@ -531,6 +539,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [Shannon Shen](https://github.com/lolipopshock)
 - [Matsumoto Toshi](https://github.com/toshi1127)
 - [John West](https://github.com/jwestwsj)
+- [David Huggins-Daines](https://github.com/dhdaines)
 - [Jeremy B. Merrill](https://github.com/jeremybmerrill)
 
 ## Contributing

diff --git a/docs/repairing.md b/docs/repairing.md
@@ -9,3 +9,7 @@ Malformed PDFs can often be [fixed via Ghostscript](https://superuser.com/questi
 - `pdfplumber.open(..., repair=True)` will repair your PDF on the fly (but not save the repaired version to disk).
 - `pdfplumber.repair(path_to_pdf)` will return a `BytesIO` object holding the bytes of a repaired version of the original file.
 - `pdfplumber.repair(path_to_pdf, outfile="path/to/repaired.pdf")` will write a repaired version of the original file to the indicated `outfile` path.
+
+## Custom parameters
+
+- `gs_path=...`: You can pass a custom path for the Ghostscript executable, helpful in case `pdfplumber` is unable to auto-detect your copy of Ghostscript.
diff --git a/pdfplumber/_version.py b/pdfplumber/_version.py
@@ -1,2 +1,2 @@
-version_info = (0, 10, 2)
+version_info = (0, 10, 3)
 __version__ = ".".join(map(str, version_info))
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -22,7 +22,7 @@
     LTPage,
     LTTextContainer,
 )
-from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
 from pdfminer.pdfpage import PDFPage
 from pdfminer.psparser import PSLiteral
 
@@ -62,6 +62,8 @@
         "stream",
         "stroke",
         "stroking_color",
+        "mcid",
+        "tag",
     ]
 )
 
@@ -115,6 +117,56 @@ def normalize_color(
     return separate_pattern(tuplefied)
 
 
+class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
+    """Extract layout from a specific page, adding marked-content IDs to
+    objects where found.
+
+    `begin_tag` / `end_tag` record the marked-content tag and MCID that are
+    currently in effect; the `render_char`, `render_image` and `paint_path`
+    overrides then copy that state onto whatever layout objects the parent
+    implementation just appended to `self.cur_item`.
+
+    NOTE: nesting is not tracked. `end_tag` clears the state instead of
+    restoring an enclosing tag, so objects that follow a nested sequence
+    get `mcid`/`tag` of `None` until the next `begin_tag`.
+    """
+
+    # Marked-content ID applied to newly created objects (None if no MCID).
+    cur_mcid: Optional[int] = None
+    # Marked-content tag name applied to newly created objects (None if none).
+    cur_tag: Optional[str] = None
+
+    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
+        """Handle beginning of tag, setting current tag and MCID (if any)."""
+        self.cur_tag = decode_text(tag.name)
+        # Only a properties dict can carry an MCID; anything else resets it.
+        if isinstance(props, dict) and "MCID" in props:
+            self.cur_mcid = props["MCID"]
+        else:
+            self.cur_mcid = None
+
+    def end_tag(self) -> None:
+        """Handle end of tag, clearing current tag and MCID."""
+        self.cur_tag = None
+        self.cur_mcid = None
+
+    def tag_cur_item(self, start: Optional[int] = None) -> None:
+        """Copy the current MCID and tag onto recently created objects.
+
+        If `start` is given, every object of `self.cur_item` from that index
+        onward is tagged; the hooks below pass the object count recorded
+        before delegating to the parent class, so only objects created by
+        that call are touched. If `start` is omitted, only the most recent
+        object is tagged. Either way, an empty container is a no-op.
+        """
+        # This is somewhat hacky and would not be necessary if
+        # pdfminer.six supported MCIDs. The parent hooks do not report
+        # which objects they created (or whether they created any), so
+        # we look at what was appended to the current container instead
+        # of assuming that its last element is always the new object.
+        objs = self.cur_item._objs
+        if start is None:
+            start = max(len(objs) - 1, 0)
+        for obj in objs[start:]:
+            obj.mcid = self.cur_mcid  # type: ignore
+            obj.tag = self.cur_tag  # type: ignore
+
+    def render_char(self, *args, **kwargs) -> float:  # type: ignore
+        """Hook for rendering characters, adding the `mcid` and `tag`
+        attributes. Returns the parent implementation's result unchanged."""
+        n_before = len(self.cur_item._objs)
+        adv = super().render_char(*args, **kwargs)
+        self.tag_cur_item(n_before)
+        return adv
+
+    def render_image(self, *args, **kwargs) -> None:  # type: ignore
+        """Hook for rendering images, adding the `mcid` and `tag` attributes."""
+        n_before = len(self.cur_item._objs)
+        super().render_image(*args, **kwargs)
+        self.tag_cur_item(n_before)
+
+    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
+        """Hook for rendering lines, rects and curves, adding the `mcid` and
+        `tag` attributes."""
+        n_before = len(self.cur_item._objs)
+        super().paint_path(*args, **kwargs)
+        self.tag_cur_item(n_before)
+
+
 class Page(Container):
     cached_properties: List[str] = Container.cached_properties + ["_layout"]
     is_original: bool = True
@@ -174,7 +226,7 @@ def height(self) -> T_num:
     def layout(self) -> LTPage:
         if hasattr(self, "_layout"):
             return self._layout
-        device = PDFPageAggregator(
+        device = PDFPageAggregatorWithMarkedContent(
             self.pdf.rsrcmgr,
             pageno=self.page_number,
             laparams=self.pdf.laparams,

diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py
@@ -70,12 +70,13 @@ def open(
         password: Optional[str] = None,
         strict_metadata: bool = False,
         repair: bool = False,
+        gs_path: Optional[Union[str, pathlib.Path]] = None,
     ) -> "PDF":
 
         stream: Union[BufferedReader, BytesIO]
 
         if repair:
-            stream = _repair(path_or_fp, password=password)
+            stream = _repair(path_or_fp, password=password, gs_path=gs_path)
             stream_is_external = False
             # Although the original file has a path,
             # the repaired version does not

diff --git a/pdfplumber/repair.py b/pdfplumber/repair.py
@@ -8,9 +8,10 @@
 def _repair(
     path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
     password: Optional[str] = None,
+    gs_path: Optional[Union[str, pathlib.Path]] = None,
 ) -> BytesIO:
 
-    executable = shutil.which("gs") or shutil.which("gswin32c")
+    executable = gs_path or shutil.which("gs") or shutil.which("gswin32c")
     if executable is None:  # pragma: nocover
         raise Exception(
             "Cannot find Ghostscript, which is required for repairs.\n"
@@ -52,8 +53,9 @@ def repair(
     path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
     outfile: Optional[Union[str, pathlib.Path]] = None,
     password: Optional[str] = None,
+    gs_path: Optional[Union[str, pathlib.Path]] = None,
 ) -> Optional[BytesIO]:
-    repaired = _repair(path_or_fp, password)
+    repaired = _repair(path_or_fp, password, gs_path=gs_path)
     if outfile:
         with open(outfile, "wb") as f:
             f.write(repaired.read())

diff --git a/pdfplumber/utils/clustering.py b/pdfplumber/utils/clustering.py
@@ -40,7 +40,10 @@ def make_cluster_dict(values: Iterable[T_num], tolerance: T_num) -> Dict[T_num,
 
 
 def cluster_objects(
-    xs: List[R], key_fn: Union[Hashable, Callable[[R], T_num]], tolerance: T_num
+    xs: List[R],
+    key_fn: Union[Hashable, Callable[[R], T_num]],
+    tolerance: T_num,
+    preserve_order: bool = False,
 ) -> List[List[R]]:
 
     if not callable(key_fn):
@@ -51,7 +54,12 @@ def cluster_objects(
 
     get_0, get_1 = itemgetter(0), itemgetter(1)
 
-    cluster_tuples = sorted(((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1)
+    if preserve_order:
+        cluster_tuples = [(x, cluster_dict.get(key_fn(x))) for x in xs]
+    else:
+        cluster_tuples = sorted(
+            ((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1
+        )
 
     grouped = itertools.groupby(cluster_tuples, key=get_1)
 

diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py
@@ -225,7 +225,10 @@ def to_textmap(
 
         for i, ws in enumerate(
             cluster_objects(
-                words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
+                words_sorted_doctop,
+                lambda x: float(x[0]["doctop"]),
+                y_tolerance,
+                preserve_order=presorted or use_text_flow,
             )
         ):
             y_dist = (

diff --git a/tests/pdfs/issue-982-example.pdf b/tests/pdfs/issue-982-example.pdf
diff --git a/tests/pdfs/mcid_example.pdf b/tests/pdfs/mcid_example.pdf
diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -70,7 +70,7 @@ def test_csv(self):
         assert c.split("\r\n")[9] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
-            ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
+            ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,'
         )
 
         io = StringIO()
@@ -125,7 +125,7 @@ def test_cli_csv(self):
         assert res.decode("utf-8").split("\r\n")[9] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
-            ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
+            ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,'
         )
 
     def test_cli_csv_exclude(self):
@@ -141,6 +141,7 @@ def test_cli_csv_exclude(self):
                 "3",
                 "--exclude-attrs",
                 "matrix",
+                "mcid",
                 "ncs",
                 "non_stroking_pattern",
                 "stroking_pattern",
@@ -150,7 +151,7 @@ def test_cli_csv_exclude(self):
         assert res.decode("utf-8").split("\r\n")[9] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             "18.0,12.996,,,,,,TimesNewRomanPSMT,"
-            ',,"(0, 0, 0)",,18.0,,,,,Y,,1,'
+            ',,"(0, 0, 0)",,18.0,,,,,,Y,,1,'
         )
 
     def test_cli_csv_include(self):

diff --git a/tests/test_issues.py b/tests/test_issues.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import logging
 import os
+import re
 import unittest
 
 import pdfplumber
@@ -257,3 +258,20 @@ def test_issue_683(self):
         with pdfplumber.open(path) as pdf:
             page = pdf.pages[0]
             page.search(r"\d+", regex=True)
+
+    def test_issue_982(self):
+        """
+        Regression test: extract_text(use_text_flow=True) had no effect.
+
+        `WordExtractor` and `WordMap` already avoided sorting words by
+        `doctop`, but `cluster_objects` still sorted them. It now takes
+        an option to keep the input order (needed for both
+        `use_text_flow` and `presorted`), so the extracted text must
+        start with the same words, in the same order, as
+        `extract_words(use_text_flow=True)`.
+        """
+        pdf_path = os.path.join(HERE, "pdfs/issue-982-example.pdf")
+        with pdfplumber.open(pdf_path) as pdf:
+            first_page = pdf.pages[0]
+            flowed_text = first_page.extract_text(use_text_flow=True)
+            # Collapse whitespace runs so the text is comparable to
+            # the space-joined word list.
+            collapsed_text = re.sub(r"\s+", " ", flowed_text)
+            flowed_words = first_page.extract_words(use_text_flow=True)
+            joined_words = " ".join(word["text"] for word in flowed_words)
+            assert collapsed_text[0:100] == joined_words[0:100]
diff --git a/tests/test_mcids.py b/tests/test_mcids.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import os
+import unittest
+
+import pdfplumber
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+
+class TestMCIDs(unittest.TestCase):
+    """Test MCID extraction."""
+
+    def test_mcids(self):
+        """Check the `mcid` / `tag` attributes on the chars, lines and curves
+        of the first page of the sample PDF."""
+        path = os.path.join(HERE, "pdfs/mcid_example.pdf")
+
+        # Open via the context manager so the file handle is released
+        # even when one of the assertions below fails.
+        with pdfplumber.open(path) as pdf:
+            page = pdf.pages[0]
+            # Check text of MCIDS: build a list indexed by MCID whose
+            # entries are "<tag>: <concatenated text>" ("" for MCIDs
+            # that have no characters).
+            mcids = []
+            for c in page.chars:
+                if "mcid" in c:
+                    while len(mcids) <= c["mcid"]:
+                        mcids.append("")
+                    if not mcids[c["mcid"]]:
+                        mcids[c["mcid"]] = c["tag"] + ": "
+                    mcids[c["mcid"]] += c["text"]
+            assert mcids == [
+                "Standard: Test of figures",
+                "",
+                "P: 1 ligne",
+                "P: 2 ligne",
+                "P: 3 ligne",
+                "P: 4 ligne",
+                "P: 0",
+                "P: 2",
+                "P: 4",
+                "P: 6",
+                "P: 8",
+                "P: 10",
+                "P: 12",
+                "P: Figure 1: Chart",
+                "",
+                "P: 1 colonne",
+                "P: 2 colonne",
+                "P: 3 colonne",
+            ]
+            # Check line and curve MCIDs
+            line_mcids = {x["mcid"] for x in page.lines}
+            curve_mcids = {x["mcid"] for x in page.curves}
+            assert all(x["tag"] == "Figure" for x in page.lines)
+            assert all(x["tag"] == "Figure" for x in page.curves)
+            # MCIDs 1 and 14 are the ones with no text above; at least
+            # one of them must show up among the lines and the curves.
+            assert line_mcids & {1, 14}
+            assert curve_mcids & {1, 14}
+            # No rects to test unfortunately!
diff --git a/tests/test_repair.py b/tests/test_repair.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import os
+import shutil
 import tempfile
 import unittest
 
@@ -56,3 +57,8 @@ def test_repair_password(self):
         path = os.path.join(HERE, "pdfs/password-example.pdf")
         with pdfplumber.open(path, repair=True, password="test") as pdf:
             assert len(pdf.pages[0].chars)
+
+    def test_repair_custom_path(self):
+        """Repair-on-open should work when given an explicit `gs_path`."""
+        malformed = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf")
+        # NOTE: `shutil.which` returns None when `gs` is not on PATH; in
+        # that case `gs_path=None` is passed and `_repair` falls back to
+        # its own lookup.
+        ghostscript = shutil.which("gs")
+        with pdfplumber.open(malformed, repair=True, gs_path=ghostscript) as pdf:
+            first_page_chars = pdf.pages[0].chars
+            assert len(first_page_chars)