Skip to content

Commit

Permalink
Add pdfplumber.open(unicode_norm=...)
Browse files Browse the repository at this point in the history
Allows user to pre-normalize Unicode characters.

h/t @petermr + @agusluques in #905
  • Loading branch information
jsvine committed Aug 4, 2024
1 parent 22494e8 commit 03a477f
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ All notable changes to this project will be documented in this file. The format

- Add `Table.columns`, analogous to `Table.rows` (h/t @Pk13055). ([#1050](https://github.com/jsvine/pdfplumber/issues/1050))
- Add `Page.extract_words(return_chars=True)`, mirroring `Page.search(..., return_chars=True)`; if this argument is passed, each word dictionary will include an additional key-value pair: `"chars": [char_object, ...]` (h/t @cmdlineluser). ([#1173](https://github.com/jsvine/pdfplumber/issues/1173))
- Add `pdfplumber.open(unicode_norm="NFC"/"NFD"/"NFKC"/"NFKD")`, where the values are the [four options for Unicode normalization](https://unicode.org/reports/tr15/#Normalization_Forms_Table) (h/t @petermr + @agusluques). ([#905](https://github.com/jsvine/pdfplumber/issues/905))

### Changed

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ To load a password-protected PDF, pass the `password` keyword argument, e.g., `p

To set layout analysis parameters to `pdfminer.six`'s layout engine, pass the `laparams` keyword argument, e.g., `pdfplumber.open("file.pdf", laparams = { "line_overlap": 0.7 })`.

To [pre-normalize Unicode text](https://unicode.org/reports/tr15/), pass `unicode_norm=...`, where `...` is one of the [four Unicode normalization forms](https://unicode.org/reports/tr15/#Normalization_Forms_Table): `"NFC"`, `"NFD"`, `"NFKC"`, or `"NFKD"`.

Invalid metadata values are treated as a warning by default. If that is not intended, pass `strict_metadata=True` to the `open` method and `pdfplumber.open` will raise an exception if it is unable to parse the metadata.

### The `pdfplumber.PDF` class
Expand Down
8 changes: 7 additions & 1 deletion pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Tuple,
Union,
)
from unicodedata import normalize as normalize_unicode

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
Expand Down Expand Up @@ -382,7 +383,12 @@ def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]:
attr[color_attr], attr[pattern_attr] = normalize_color(attr[color_attr])

if isinstance(obj, (LTChar, LTTextContainer)):
attr["text"] = obj.get_text()
text = obj.get_text()
attr["text"] = (
normalize_unicode(self.pdf.unicode_norm, text)
if self.pdf.unicode_norm is not None
else text
)

if isinstance(obj, LTChar):
# pdfminer.six (at least as of v20221105) does not
Expand Down
6 changes: 5 additions & 1 deletion pdfplumber/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pathlib
from io import BufferedReader, BytesIO
from types import TracebackType
from typing import Any, Dict, List, Optional, Tuple, Type, Union
from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union

from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
Expand Down Expand Up @@ -34,13 +34,15 @@ def __init__(
laparams: Optional[Dict[str, Any]] = None,
password: Optional[str] = None,
strict_metadata: bool = False,
unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
):
self.stream = stream
self.stream_is_external = stream_is_external
self.path = path
self.pages_to_parse = pages
self.laparams = None if laparams is None else LAParams(**laparams)
self.password = password
self.unicode_norm = unicode_norm

self.doc = PDFDocument(PDFParser(stream), password=password or "")
self.rsrcmgr = PDFResourceManager()
Expand Down Expand Up @@ -70,6 +72,7 @@ def open(
laparams: Optional[Dict[str, Any]] = None,
password: Optional[str] = None,
strict_metadata: bool = False,
unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
repair: bool = False,
gs_path: Optional[Union[str, pathlib.Path]] = None,
repair_setting: T_repair_setting = "default",
Expand Down Expand Up @@ -102,6 +105,7 @@ def open(
laparams=laparams,
password=password,
strict_metadata=strict_metadata,
unicode_norm=unicode_norm,
stream_is_external=stream_is_external,
)

Expand Down
Binary file added tests/pdfs/issue-905.pdf
Binary file not shown.
13 changes: 13 additions & 0 deletions tests/test_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,19 @@ def test_password(self):
with pdfplumber.open(path, password="test") as pdf:
assert len(pdf.chars) > 0

def test_unicode_normalization(self):
    """Check that `unicode_norm` pre-normalizes extracted character text.

    The fixture is opened twice: once with default settings and once
    with `unicode_norm="NFC"`, comparing the code point of the first
    character on the first page in each case.
    """
    path = os.path.join(HERE, "pdfs/issue-905.pdf")

    # Default (no normalization): the first char is code point 894,
    # i.e. U+037E.
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        assert ord(page.chars[0]["text"]) == 894

    # With NFC normalization the same char becomes code point 59
    # (U+003B, the ASCII semicolon), and the extracted page text is
    # made up of plain semicolons.
    with pdfplumber.open(path, unicode_norm="NFC") as pdf:
        page = pdf.pages[0]
        assert ord(page.chars[0]["text"]) == 59
        assert page.extract_text() == ";;"

def test_colors(self):
rect = self.pdf.pages[0].rects[0]
assert rect["non_stroking_color"] == (0.8, 1, 1)
Expand Down

0 comments on commit 03a477f

Please sign in to comment.