From d39302fa2ce5976f92276f60d10c127167f94d26 Mon Sep 17 00:00:00 2001
From: Jeremy Singer-Vine <jsvine@gmail.com>
Date: Fri, 2 Aug 2024 19:00:23 -0400
Subject: [PATCH] Add Table.columns, analogous to Table.rows (#1050)

h/t @Pk13055 for the suggestion
---
 CHANGELOG.md        |  4 ++++
 README.md           |  2 +-
 pdfplumber/table.py | 36 +++++++++++++++++++++++++++++-------
 tests/test_table.py | 26 ++++++++++++++++++++++++++
 4 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 04068a03..6b8ea77e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format
 
 ## [0.11.3] - Unreleased
 
+### Added
+
+- Add `Table.columns`, analogous to `Table.rows` (h/t @Pk13055). ([#1050](https://github.com/jsvine/pdfplumber/issues/1050))
+
 ### Changed
 
 - Change default setting `pdfplumber.repair(...)` passes to Ghostscript's `-dPDFSETTINGS` parameter, from `prepress` to `default`, and make that setting modifiable via `.repair(setting=...)`, where the value is one of `"default"`, `"prepress"`, `"printer"`, or `"ebook"` (h/t @Laubeee). ([#874](https://github.com/jsvine/pdfplumber/issues/874))
diff --git a/README.md b/README.md
index 9a2727d7..904e5d03 100644
--- a/README.md
+++ b/README.md
@@ -368,7 +368,7 @@ Note: The methods above are built on Pillow's [`ImageDraw` methods](http://pillo
 
 | Method | Description |
 |--------|-------------|
-|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.|
+|`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, `.columns`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.|
 |`.find_table(table_settings={})`|Similar to `.find_tables(...)`, but returns the *largest* table on the page, as a `Table` object. If multiple tables have the same size — as measured by the number of cells — this method returns the table closest to the top of the page.|
 |`.extract_tables(table_settings={})`|Returns the text extracted from *all* tables found on the page, represented as a list of lists of lists, with the structure `table -> row -> cell`.|
 |`.extract_table(table_settings={})`|Returns the text extracted from the *largest* table on the page (see `.find_table(...)` above), represented as a list of lists, with the structure `row -> cell`.|
diff --git a/pdfplumber/table.py b/pdfplumber/table.py
index 7bcffc63..7fae68e9 100644
--- a/pdfplumber/table.py
+++ b/pdfplumber/table.py
@@ -370,6 +370,10 @@ class Row(CellGroup):
     pass
 
 
+class Column(CellGroup):
+    pass
+
+
 class Table(object):
     def __init__(self, page: "Page", cells: List[T_bbox]):
         self.page = page
@@ -385,17 +389,35 @@ def bbox(self) -> T_bbox:
             max(map(itemgetter(3), c)),
         )
 
-    @property
-    def rows(self) -> List[Row]:
-        _sorted = sorted(self.cells, key=itemgetter(1, 0))
-        xs = list(sorted(set(map(itemgetter(0), self.cells))))
+    def _get_rows_or_cols(self, kind: type[CellGroup]) -> List[CellGroup]:
+        axis = 0 if kind is Row else 1
+        antiaxis = int(not axis)
+
+        # Sort first by top/x0, then by x0/top
+        _sorted = sorted(self.cells, key=itemgetter(antiaxis, axis))
+
+        # Get the sorted list of all unique x0s/tops
+        xs = list(sorted(set(map(itemgetter(axis), self.cells))))
+
+        # Group by top/x0
+        grouped = itertools.groupby(_sorted, itemgetter(antiaxis))
+
         rows = []
-        for y, row_cells in itertools.groupby(_sorted, itemgetter(1)):
-            xdict = {cell[0]: cell for cell in row_cells}
-            row = Row([xdict.get(x) for x in xs])
+        # For each top/x0 group, index that row's/column's cells by x0/top
+        for y, row_cells in grouped:
+            xdict = {cell[axis]: cell for cell in row_cells}
+            row = kind([xdict.get(x) for x in xs])
             rows.append(row)
         return rows
 
+    @property
+    def rows(self) -> List[CellGroup]:
+        return self._get_rows_or_cols(Row)
+
+    @property
+    def columns(self) -> List[CellGroup]:
+        return self._get_rows_or_cols(Column)
+
     def extract(self, **kwargs: Any) -> List[List[Optional[str]]]:
 
         chars = self.page.chars
diff --git a/tests/test_table.py b/tests/test_table.py
index c09ccbfb..cdc2e3f7 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -73,6 +73,32 @@ def test_edges_strict(self):
             "",
         ]
 
+    def test_rows_and_columns(self):
+        path = os.path.join(HERE, "pdfs/issue-140-example.pdf")
+        with pdfplumber.open(path) as pdf:
+            page = pdf.pages[0]
+            table = page.find_table()
+            row = [page.crop(bbox).extract_text() for bbox in table.rows[0].cells]
+            assert row == [
+                "Line no",
+                "UPC code",
+                "Location",
+                "Item Description",
+                "Item Quantity",
+                "Bill Amount",
+                "Accrued Amount",
+                "Handling Rate",
+                "PO number",
+            ]
+            col = [page.crop(bbox).extract_text() for bbox in table.columns[1].cells]
+            assert col == [
+                "UPC code",
+                "0085648100305",
+                "0085648100380",
+                "0085648100303",
+                "0085648100300",
+            ]
+
     def test_explicit_desc_decimalization(self):
         """
         See issue #290