Skip to content

Commit

Permalink
Move page number method to path utils
Browse files Browse the repository at this point in the history
  • Loading branch information
rlskoeser committed Nov 4, 2024
1 parent 63f40a8 commit 014b01f
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 20 deletions.
16 changes: 1 addition & 15 deletions src/corppa/ocr/collate_txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,7 @@

from tqdm import tqdm

from corppa.utils.path_utils import find_relative_paths


def page_number(filename: pathlib.Path) -> str:
"""Extract and return the page number from a :class:`pathlib.Path`
file for content from a single page (e.g., image or text). Returns the
page number as a string with leading zeros. (Note: logic is curently
specific to Gale/ECCO file naming conventions.)"""
# NOTE: this logic is currently specific to Gale/ECCO files,
# which look like CW0112029406_00180.txt

# split the file base/stem name by _ and take the last part
pagenum = filename.stem.split("_")[-1]
# return the number as a string; strip extra trailing zero
return pagenum[:-1] # strip trailing zero
from corppa.utils.path_utils import find_relative_paths, page_number


def collate_txt(
Expand Down
14 changes: 14 additions & 0 deletions src/corppa/utils/path_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,20 @@ def get_image_relpath(work_id, page_num):
raise ValueError(f"Unsupported source '{source}'")


def page_number(filename: pathlib.Path) -> str:
"""Extract and return the page number from a :class:`pathlib.Path`
file for content from a single page (e.g., image or text). Returns the
page number as a string with leading zeros. (Note: logic is curently
specific to Gale/ECCO file naming conventions.)"""
# NOTE: this logic is currently specific to Gale/ECCO files,
# which look like CW0112029406_00180.txt

# split the file base/stem name by _ and take the last part
pagenum = filename.stem.split("_")[-1]
# return the number as a string; strip extra trailing zero
return pagenum[:-1] # strip trailing zero


def find_relative_paths(
base_dir, exts, follow_symlinks=True, group_by_dir=False
) -> Iterator[pathlib.Path] | Iterator[tuple[pathlib.Path, list]]:
Expand Down
6 changes: 1 addition & 5 deletions test/test_ocr/test_collate_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,7 @@

import pytest

from corppa.ocr.collate_txt import collate_txt, main, page_number


def test_page_number():
    """Check page number extraction for a Gale/ECCO-style filename."""
    page_file = pathlib.Path("CW0112029406_00180.txt")
    assert page_number(page_file) == "0018"
from corppa.ocr.collate_txt import collate_txt, main


def test_collate_txt(tmp_path, capsys):
Expand Down
5 changes: 5 additions & 0 deletions test/test_utils/test_path_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
get_stub_dir,
get_vol_dir,
get_volume_id,
page_number,
)


Expand Down Expand Up @@ -105,6 +106,10 @@ def test_get_volume_id():
assert get_volume_id("coo1.ark:/13960/t4bp0n867-p3") == "coo1.ark:/13960/t4bp0n867"


def test_page_number():
    """Check page number extraction for a Gale/ECCO-style filename."""
    page_file = pathlib.Path("CW0112029406_00180.txt")
    assert page_number(page_file) == "0018"


@patch("corppa.utils.path_utils.get_volume_id", return_value="vol_id")
@patch("corppa.utils.path_utils.get_vol_dir", return_value=pathlib.Path("vol_dir"))
@patch("corppa.utils.path_utils.get_ppa_source")
Expand Down

0 comments on commit 014b01f

Please sign in to comment.