Skip to content

Commit 0913b6d

Browse files
committed
Revert get_page_number arg back to pathlib.Path
1 parent 73739df commit 0913b6d

File tree

3 files changed

+5
-5
lines changed

3 files changed

+5
-5
lines changed

src/corppa/ocr/collate_txt.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ def collate_txt(
5555
txt_data = {}
5656
for filename in files:
5757
with (input_dir / filename).open(encoding="utf-8") as txtfile:
58-
txt_data[get_page_number(filename.name)] = txtfile.read()
58+
txt_data[get_page_number(filename)] = txtfile.read()
5959

6060
# ensure the parent directory exists
6161
output_file.parent.mkdir(exist_ok=True)

src/corppa/utils/path_utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,7 @@ def get_image_relpath(work_id, page_num):
118118
raise ValueError(f"Unsupported source '{source}'")
119119

120120

121-
def get_page_number(filename: str) -> str:
121+
def get_page_number(pagefile: pathlib.Path) -> str:
122122
"""Extract and return the page number from the filename for page-level
123123
content (e.g., image or text). Returns the page number as a string
124124
with leading zeros. (Note: logic is currently
@@ -127,7 +127,7 @@ def get_page_number(filename: str) -> str:
127127
# which look like CW0112029406_00180.txt
128128

129129
# split the file base/stem name by _ and take the last part
130-
source_id, pagenum = os.path.splitext(filename)[0].split("_", 1)
130+
source_id, pagenum = pagefile.stem.split("_", 1)
131131
if get_ppa_source(source_id) != "Gale":
132132
raise NotImplementedError
133133
# return the number as a string; strip extra trailing zero

test/test_utils/test_path_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,10 @@ def test_get_volume_id():
107107

108108

109109
def test_page_number():
110-
assert get_page_number("CW0112029406_00180.txt") == "0018"
110+
assert get_page_number(pathlib.Path("CW0112029406_00180.txt")) == "0018"
111111
# raise not implemented error if source id is not Gale/ECCO
112112
with pytest.raises(NotImplementedError):
113-
assert get_page_number("uva.x002075945_00180.txt") == "0018"
113+
assert get_page_number(pathlib.Path("uva.x002075945_00180.txt")) == "0018"
114114

115115

116116
@patch("corppa.utils.path_utils.get_volume_id", return_value="vol_id")

0 commit comments

Comments
 (0)