Add Table.columns, analogous to Table.rows (#1050)

h/t @Pk13055 for the suggestion
jsvine · Aug 2, 2024 · d39302f
1 parent 48cab3f
commit d39302f
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format
 
 ## [0.11.3] - Unreleased
 
+### Added
+
+- Add `Table.columns`, analogous to `Table.rows` (h/t @Pk13055). ([#1050](https://github.com/jsvine/pdfplumber/issues/1050))
+
 ### Changed
 
 - Change default setting `pdfplumber.repair(...)` passes to Ghostscript's `-dPDFSETTINGS` parameter, from `prepress` to `default`, and make that setting modifiable via `.repair(setting=...)`, where the value is one of `"default"`, `"prepress"`, `"printer"`, or `"ebook"` (h/t @Laubeee). ([#874](https://github.com/jsvine/pdfplumber/issues/874))

diff --git a/README.md b/README.md
@@ -368,7 +368,7 @@ Note: The methods above are built on Pillow's [`ImageDraw` methods](http://pillo
 
 | Method | Description |
 |--------|-------------|
-|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.|
+|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, `.columns`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.|
 |`.find_table(table_settings={})`|Similar to `.find_tables(...)`, but returns the *largest* table on the page, as a `Table` object. If multiple tables have the same size — as measured by the number of cells — this method returns the table closest to the top of the page.|
 |`.extract_tables(table_settings={})`|Returns the text extracted from *all* tables found on the page, represented as a list of lists of lists, with the structure `table -> row -> cell`.|
 |`.extract_table(table_settings={})`|Returns the text extracted from the *largest* table on the page (see `.find_table(...)` above), represented as a list of lists, with the structure `row -> cell`.|

diff --git a/pdfplumber/table.py b/pdfplumber/table.py
@@ -370,6 +370,10 @@ class Row(CellGroup):
     pass
 
 
+class Column(CellGroup):
+    pass
+
+
 class Table(object):
     def __init__(self, page: "Page", cells: List[T_bbox]):
         self.page = page
@@ -385,17 +389,35 @@ def bbox(self) -> T_bbox:
             max(map(itemgetter(3), c)),
         )
 
-    @property
-    def rows(self) -> List[Row]:
-        _sorted = sorted(self.cells, key=itemgetter(1, 0))
-        xs = list(sorted(set(map(itemgetter(0), self.cells))))
+    def _get_rows_or_cols(self, kind: type[CellGroup]) -> List[CellGroup]:
+        axis = 0 if kind is Row else 1
+        antiaxis = int(not axis)
+
+        # Rows: sort by top, then x0. Columns: sort by x0, then top.
+        _sorted = sorted(self.cells, key=itemgetter(antiaxis, axis))
+
+        # Collect the distinct x0s (for rows) or tops (for columns), sorted
+        xs = list(sorted(set(map(itemgetter(axis), self.cells))))
+
+        # Group cells sharing a top (rows) or an x0 (columns)
+        grouped = itertools.groupby(_sorted, itemgetter(antiaxis))
+
         rows = []
-        for y, row_cells in itertools.groupby(_sorted, itemgetter(1)):
-            xdict = {cell[0]: cell for cell in row_cells}
-            row = Row([xdict.get(x) for x in xs])
+        # Despite the names, y/row_cells hold an x0/column cells when kind is not Row
+        for y, row_cells in grouped:
+            xdict = {cell[axis]: cell for cell in row_cells}
+            row = kind([xdict.get(x) for x in xs])
             rows.append(row)
         return rows
 
+    @property
+    def rows(self) -> List[CellGroup]:
+        return self._get_rows_or_cols(Row)
+
+    @property
+    def columns(self) -> List[CellGroup]:
+        return self._get_rows_or_cols(Column)
+
     def extract(self, **kwargs: Any) -> List[List[Optional[str]]]:
 
         chars = self.page.chars

diff --git a/tests/test_table.py b/tests/test_table.py
@@ -73,6 +73,32 @@ def test_edges_strict(self):
             "",
         ]
 
+    def test_rows_and_columns(self):
+        path = os.path.join(HERE, "pdfs/issue-140-example.pdf")
+        with pdfplumber.open(path) as pdf:
+            page = pdf.pages[0]
+            table = page.find_table()
+            row = [page.crop(bbox).extract_text() for bbox in table.rows[0].cells]
+            assert row == [
+                "Line no",
+                "UPC code",
+                "Location",
+                "Item Description",
+                "Item Quantity",
+                "Bill Amount",
+                "Accrued Amount",
+                "Handling Rate",
+                "PO number",
+            ]
+            col = [page.crop(bbox).extract_text() for bbox in table.columns[1].cells]
+            assert col == [
+                "UPC code",
+                "0085648100305",
+                "0085648100380",
+                "0085648100303",
+                "0085648100300",
+            ]
+
     def test_explicit_desc_decimalization(self):
         """
         See issue #290