From d39302fa2ce5976f92276f60d10c127167f94d26 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Fri, 2 Aug 2024 19:00:23 -0400 Subject: [PATCH] Add Table.columns, analogous to Table.rows (#1050) h/t @Pk13055 for the suggestion --- CHANGELOG.md | 4 ++++ README.md | 2 +- pdfplumber/table.py | 36 +++++++++++++++++++++++++++++------- tests/test_table.py | 26 ++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04068a03..6b8ea77e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format ## [0.11.3] - Unreleased +### Added + +- Add `Table.columns`, analogous to `Table.rows` (h/t @Pk13055). ([#1050](https://github.com/jsvine/pdfplumber/issues/1050)) + ### Changed - Change default setting `pdfplumber.repair(...)` passes to Ghostscript's `-dPDFSETTINGS` parameter, from `prepress` to `default`, and make that setting modifiable via `.repair(setting=...)`, where the value is one of `"default"`, `"prepress"`, `"printer"`, or `"ebook"` (h/t @Laubeee). ([#874](https://github.com/jsvine/pdfplumber/issues/874)) diff --git a/README.md b/README.md index 9a2727d7..904e5d03 100644 --- a/README.md +++ b/README.md @@ -368,7 +368,7 @@ Note: The methods above are built on Pillow's [`ImageDraw` methods](http://pillo | Method | Description | |--------|-------------| -|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.| +|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, `.columns`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.| |`.find_table(table_settings={})`|Similar to `.find_tables(...)`, but returns the *largest* table on the page, as a `Table` object. If multiple tables have the same size — as measured by the number of cells — this method returns the table closest to the top of the page.| |`.extract_tables(table_settings={})`|Returns the text extracted from *all* tables found on the page, represented as a list of lists of lists, with the structure `table -> row -> cell`.| |`.extract_table(table_settings={})`|Returns the text extracted from the *largest* table on the page (see `.find_table(...)` above), represented as a list of lists, with the structure `row -> cell`.| diff --git a/pdfplumber/table.py b/pdfplumber/table.py index 7bcffc63..7fae68e9 100644 --- a/pdfplumber/table.py +++ b/pdfplumber/table.py @@ -370,6 +370,10 @@ class Row(CellGroup): pass +class Column(CellGroup): + pass + + class Table(object): def __init__(self, page: "Page", cells: List[T_bbox]): self.page = page @@ -385,17 +389,35 @@ def bbox(self) -> T_bbox: max(map(itemgetter(3), c)), ) - @property - def rows(self) -> List[Row]: - _sorted = sorted(self.cells, key=itemgetter(1, 0)) - xs = list(sorted(set(map(itemgetter(0), self.cells)))) + def _get_rows_or_cols(self, kind: type[CellGroup]) -> List[CellGroup]: + axis = 0 if kind is Row else 1 + antiaxis = int(not axis) + + # Sort first by top/x0, then by x0/top + _sorted = sorted(self.cells, key=itemgetter(antiaxis, axis)) + + # Sort get all x0s/tops + xs = list(sorted(set(map(itemgetter(axis), self.cells)))) + + # Group by top/x0 + grouped = itertools.groupby(_sorted, itemgetter(antiaxis)) + rows = [] - for y, row_cells in itertools.groupby(_sorted, itemgetter(1)): - xdict = {cell[0]: cell for cell in row_cells} - row = Row([xdict.get(x) for x in xs]) + # for y/x, row/column-cells ... + for y, row_cells in grouped: + xdict = {cell[axis]: cell for cell in row_cells} + row = kind([xdict.get(x) for x in xs]) rows.append(row) return rows + @property + def rows(self) -> List[CellGroup]: + return self._get_rows_or_cols(Row) + + @property + def columns(self) -> List[CellGroup]: + return self._get_rows_or_cols(Column) + def extract(self, **kwargs: Any) -> List[List[Optional[str]]]: chars = self.page.chars diff --git a/tests/test_table.py b/tests/test_table.py index c09ccbfb..cdc2e3f7 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -73,6 +73,32 @@ def test_edges_strict(self): "", ] + def test_rows_and_columns(self): + path = os.path.join(HERE, "pdfs/issue-140-example.pdf") + with pdfplumber.open(path) as pdf: + page = pdf.pages[0] + table = page.find_table() + row = [page.crop(bbox).extract_text() for bbox in table.rows[0].cells] + assert row == [ + "Line no", + "UPC code", + "Location", + "Item Description", + "Item Quantity", + "Bill Amount", + "Accrued Amount", + "Handling Rate", + "PO number", + ] + col = [page.crop(bbox).extract_text() for bbox in table.columns[1].cells] + assert col == [ + "UPC code", + "0085648100305", + "0085648100380", + "0085648100303", + "0085648100300", + ] + def test_explicit_desc_decimalization(self): """ See issue #290