Skip to content

Commit

Permalink
Updated HathiTrust directory structure (#114)
Browse files Browse the repository at this point in the history
* Added support for HathiTrust stub and volume directories in path_utils

* Additional refactoring of path_utils

* Added ruff as a dev dependency

---------

Co-authored-by: Rebecca Sutton Koeser <[email protected]>
  • Loading branch information
laurejt and rlskoeser authored Nov 6, 2024
1 parent 5b394a3 commit 430eba1
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 66 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ test = [
"pytest-cov"
]
ocr = ["google-cloud-vision"]
dev = ["pre-commit", "corppa[test]", "corppa[ocr]"]
dev = ["pre-commit", "ruff", "corppa[test]", "corppa[ocr]"]

[project.scripts]
corppa-filter = "corppa.utils.filter:main"
Expand Down
55 changes: 29 additions & 26 deletions src/corppa/utils/path_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@
"""

import os
import pathlib
from typing import Iterator
from pathlib import Path
from typing import Iterator, List

_htid_encode_map = {":": "+", "/": "=", ".": ","}
_htid_encode_table = str.maketrans(_htid_encode_map)
_htid_decode_map = {v: k for k, v in _htid_encode_map.items()}
_htid_decode_table = str.maketrans(_htid_decode_map)


def encode_htid(htid):
def encode_htid(htid: str) -> str:
"""
Returns the "clean" version of a HathiTrust volume identifier with the form:
[library id].[volume id]
Expand All @@ -26,7 +26,7 @@ def encode_htid(htid):
return f"{lib_id}.{vol_id}"


def decode_htid(encoded_htid):
def decode_htid(encoded_htid: str) -> str:
"""
Return original HathiTrust volume identifier from encoded version:
[library id].[encoded volume id]
Expand All @@ -40,7 +40,7 @@ def decode_htid(encoded_htid):
return f"{lib_id}.{vol_id}"


def get_ppa_source(vol_id):
def get_ppa_source(vol_id: str) -> str:
"""
For a given volume id, return the corresponding source.
Assume:
Expand All @@ -56,42 +56,42 @@ def get_ppa_source(vol_id):
raise ValueError(f"Can't identify source for volume '{vol_id}'")


def get_stub_dir(source, vol_id):
def get_stub_dir(source: str, vol_id: str) -> Path:
"""
Returns the stub directory name for the specified volume (vol_id) and
source type (source)
Returns the stub directory path (pathlib.Path) for the specified volume (vol_id)
For Gale, every third number (excluding the leading 0) of the volume
identifier is used.
For Gale, the path is formed from every third number (excluding the leading 0)
of the volume identifier.
Ex. CB0127060085 --> 100
For HathiTrust, the library portion of the volume identifier is used.
Ex. mdp.39015003633594 --> mdp
For HathiTrust, we use the Stubbytree directory specification created by HTRC.
The path is composed of two directories: (1) the library portion of the volume
identifier and (2) every third character of the encoded volume identifier.
Ex. mdp.39015003633594 --> mdp/31039
"""
if source == "Gale":
return vol_id[::3][1:]
return Path(vol_id[::3][1:])
elif source == "HathiTrust":
return vol_id.split(".", maxsplit=1)[0]
lib_id, vol_id = encode_htid(vol_id).split(".", 1)
return Path(lib_id, vol_id[::3])
else:
raise ValueError(f"Unknown source '{source}'")


def get_vol_dir(vol_id):
def get_vol_dir(vol_id: str) -> Path:
"""
Returns the volume directory (pathlib.Path) for the specified volume (vol_id)
"""
source = get_ppa_source(vol_id)
if source == "Gale":
return pathlib.Path(source, get_stub_dir(source, vol_id), vol_id)
return Path(source, get_stub_dir(source, vol_id), vol_id)
elif source == "HathiTrust":
# TODO: This does not match tigerdata
# return pathlib.Path(source, get_stub_dir(source, vol_id), encode_htid(vol_id))
raise NotImplementedError(f"{source} volume directory conventions TBD")
return Path(source, get_stub_dir(source, vol_id), encode_htid(vol_id))
else:
raise ValueError(f"Unknown source '{source}'")


def get_volume_id(work_id):
def get_volume_id(work_id: str) -> str:
"""
Extract volume id from PPA work id
Expand All @@ -102,7 +102,7 @@ def get_volume_id(work_id):
return work_id.rsplit("-p", 1)[0]


def get_image_relpath(work_id, page_num):
def get_image_relpath(work_id: str, page_num: int) -> Path:
"""
Get the (relative) image path for specified PPA work page
"""
Expand All @@ -118,7 +118,7 @@ def get_image_relpath(work_id, page_num):
raise ValueError(f"Unsupported source '{source}'")


def get_page_number(pagefile: pathlib.Path) -> str:
def get_page_number(pagefile: Path) -> str:
"""Extract and return the page number from the filename for page-level
content (e.g., image or text). Returns the page number as a string
with leading zeros. (Note: logic is currently
Expand All @@ -135,8 +135,11 @@ def get_page_number(pagefile: pathlib.Path) -> str:


def find_relative_paths(
base_dir, exts, follow_symlinks=True, group_by_dir=False
) -> Iterator[pathlib.Path] | Iterator[tuple[pathlib.Path, list]]:
base_dir: Path,
exts: List[str],
follow_symlinks: bool = True,
group_by_dir: bool = False,
) -> Iterator[Path] | Iterator[tuple[Path, list]]:
"""
This method finds files anywhere under the specified base directory
that match any of the specified file extensions (case insensitive),
Expand All @@ -157,7 +160,7 @@ def find_relative_paths(
When `group_by_dir` is `True`, resulting files will be returned grouped
by the parent directory. The return result is a tuple of a single :class:`pathlib.Path`
object for the directory and a list of :class:`pathlib.Path` objects for the files in that
object for the directory and a list of :class:`pathlib.Path` objects for the files in that
directory that match the specified extensions. Given a hierarchy like this:
```
images/vol-a/
Expand All @@ -181,7 +184,7 @@ def find_relative_paths(
for dirpath, dirnames, filenames in walk_generator:
if isinstance(dirpath, str):
# Convert str produced by os.walk to Path object
dirpath = pathlib.Path(dirpath)
dirpath = Path(dirpath)
# Create a generator of relevant files in the current directory
include_files = (
dirpath.joinpath(file).relative_to(base_dir)
Expand Down
72 changes: 33 additions & 39 deletions test/test_utils/test_path_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import pathlib
from pathlib import Path
from types import GeneratorType
from unittest.mock import patch

Expand Down Expand Up @@ -53,9 +53,9 @@ def test_get_ppa_source():

def test_get_stub_dir():
# Gale
assert get_stub_dir("Gale", "CB0127060085") == "100"
assert get_stub_dir("Gale", "CB0127060085") == Path("100")
# HathiTrust
assert get_stub_dir("HathiTrust", "mdp.39015003633594") == "mdp"
assert get_stub_dir("HathiTrust", "mdp.39015003633594") == Path("mdp", "31039")
# Other
with pytest.raises(ValueError, match="Unknown source 'invalid src'"):
get_stub_dir("invalid src", "xxx0000")
Expand All @@ -66,23 +66,21 @@ def test_get_stub_dir():
def test_get_vol_dir_gale(mock_get_ppa_source, mock_get_stub_dir):
# Set returned source value to Gale
mock_get_ppa_source.return_value = "Gale"
assert get_vol_dir("gale_id") == pathlib.Path("Gale", "stub_name", "gale_id")
mock_get_ppa_source.assert_called_with("gale_id")
mock_get_stub_dir.assert_called_with("Gale", "gale_id")
assert get_vol_dir("gale_id") == Path("Gale", "stub_name", "gale_id")
mock_get_ppa_source.assert_called_once_with("gale_id")
mock_get_stub_dir.assert_called_once_with("Gale", "gale_id")


@patch("corppa.utils.path_utils.encode_htid", return_value="encoded_htid")
@patch("corppa.utils.path_utils.get_stub_dir", return_value="stub_name")
@patch("corppa.utils.path_utils.get_ppa_source")
def test_get_vol_dir_hathi(mock_get_ppa_source, mock_get_stub_dir):
def test_get_vol_dir_hathi(mock_get_ppa_source, mock_get_stub_dir, mock_encode_htid):
# Set returned source value to HathiTrust
mock_get_ppa_source.return_value = "HathiTrust"
# TODO: Update once HathiTrust directory conventions are finalized
with pytest.raises(
NotImplementedError, match="HathiTrust volume directory conventions TBD"
):
get_vol_dir("htid")
mock_get_ppa_source.assert_called_with("htid")
mock_get_stub_dir.assert_not_called()
assert get_vol_dir("htid") == Path("HathiTrust", "stub_name", "encoded_htid")
mock_get_ppa_source.assert_called_once_with("htid")
mock_get_stub_dir.assert_called_once_with("HathiTrust", "htid")
mock_encode_htid.assert_called_once_with("htid")


@patch("corppa.utils.path_utils.get_stub_dir", return_value="stub_name")
Expand All @@ -92,7 +90,7 @@ def test_get_vol_dir_unk(mock_get_ppa_source, mock_get_stub_dir):
mock_get_ppa_source.return_value = "Unknown"
with pytest.raises(ValueError, match="Unknown source 'Unknown'"):
get_vol_dir("vol_id")
mock_get_ppa_source.assert_called_with("vol_id")
mock_get_ppa_source.assert_called_once_with("vol_id")
mock_get_stub_dir.assert_not_called()


Expand All @@ -107,24 +105,20 @@ def test_get_volume_id():


def test_page_number():
assert get_page_number(pathlib.Path("CW0112029406_00180.txt")) == "0018"
assert get_page_number(Path("CW0112029406_00180.txt")) == "0018"
# raise not implemented error if source id is not Gale/ECCO
with pytest.raises(NotImplementedError):
assert get_page_number(pathlib.Path("uva.x002075945_00180.txt")) == "0018"
assert get_page_number(Path("uva.x002075945_00180.txt")) == "0018"


@patch("corppa.utils.path_utils.get_volume_id", return_value="vol_id")
@patch("corppa.utils.path_utils.get_vol_dir", return_value=pathlib.Path("vol_dir"))
@patch("corppa.utils.path_utils.get_vol_dir", return_value=Path("vol_dir"))
@patch("corppa.utils.path_utils.get_ppa_source")
def test_get_image_relpath(mock_get_ppa_source, mock_get_vol_dir, mock_get_volume_id):
# Gale
mock_get_ppa_source.return_value = "Gale"
assert get_image_relpath("test_id", 4) == pathlib.Path(
"vol_dir", "vol_id_00040.TIF"
)
assert get_image_relpath("test_id", 100) == pathlib.Path(
"vol_dir", "vol_id_01000.TIF"
)
assert get_image_relpath("test_id", 4) == Path("vol_dir", "vol_id_00040.TIF")
assert get_image_relpath("test_id", 100) == Path("vol_dir", "vol_id_01000.TIF")

# HathiTrust
mock_get_ppa_source.return_value = "HathiTrust"
Expand All @@ -138,9 +132,9 @@ def test_get_image_relpath(mock_get_ppa_source, mock_get_vol_dir, mock_get_volum


def test_find_relative_paths(tmp_path):
jpg_a = pathlib.Path("a.jpg")
jpg_a = Path("a.jpg")
tmp_path.joinpath(jpg_a).touch()
txt_b = pathlib.Path("b.txt")
txt_b = Path("b.txt")
tmp_path.joinpath(txt_b).touch()

# I. Single ext
Expand All @@ -149,13 +143,13 @@ def test_find_relative_paths(tmp_path):
assert [jpg_a] == list(paths)

# II. Multiple extensions
tif_c = pathlib.Path("c.tif")
tif_c = Path("c.tif")
tmp_path.joinpath(tif_c).touch()
paths = list(find_relative_paths(tmp_path, [".jpg", ".tif"]))
assert {jpg_a, tif_c} == set(paths)

# III. Extension handling is case insensitive
jpg_d = pathlib.Path("d.JPG")
jpg_d = Path("d.JPG")
tmp_path.joinpath(jpg_d).touch()
paths_a = list(find_relative_paths(tmp_path, [".jpg"]))
paths_b = list(find_relative_paths(tmp_path, [".JPG"]))
Expand All @@ -166,24 +160,24 @@ def test_find_relative_paths(tmp_path):
def test_find_relative_paths_nested(tmp_path):
img_dir = tmp_path.joinpath(tmp_path, "images")
img_dir.mkdir()
jpg_a = pathlib.Path("a.jpg")
jpg_a = Path("a.jpg")
tmp_path.joinpath(jpg_a).touch()
jpg_b = pathlib.Path("b.jpg")
jpg_b = Path("b.jpg")
img_dir.joinpath(jpg_b).touch()

paths = find_relative_paths(img_dir, [".jpg"])
assert {jpg_b} == set(paths)

paths = find_relative_paths(tmp_path, [".jpg"])
assert {jpg_a, pathlib.Path("images", "b.jpg")} == set(paths)
assert {jpg_a, Path("images", "b.jpg")} == set(paths)


def test_image_relpath_hidden_dirs(tmp_path):
jpg_a = pathlib.Path("a.jpg")
jpg_a = Path("a.jpg")
tmp_path.joinpath(jpg_a).touch()
hidden_dir = tmp_path.joinpath(".hidden")
hidden_dir.mkdir()
jpg_b = pathlib.Path("b.jpg")
jpg_b = Path("b.jpg")
hidden_dir.joinpath(jpg_b).touch()

paths = list(find_relative_paths(tmp_path, [".jpg"]))
Expand All @@ -210,11 +204,11 @@ def test_find_relative_paths_symbolic_links(tmp_path):
dir_c = dir_b.joinpath("dir_c")
dir_c.mkdir()
# Create files
jpg_a = pathlib.Path("a.jpg")
jpg_a = Path("a.jpg")
dir_a.joinpath(jpg_a).touch()
jpg_b = pathlib.Path("b.jpg")
jpg_b = Path("b.jpg")
dir_b.joinpath(jpg_b).touch()
jpg_c = pathlib.Path("c.jpg")
jpg_c = Path("c.jpg")
dir_c.joinpath(jpg_c).touch()
# Create symbolic links
sym_b = dir_a.joinpath("b.jpg")
Expand All @@ -224,7 +218,7 @@ def test_find_relative_paths_symbolic_links(tmp_path):

# Default follows symbolic links
paths = list(find_relative_paths(dir_a, [".jpg"]))
assert {jpg_a, jpg_b, pathlib.Path("c", "c.jpg")} == set(paths)
assert {jpg_a, jpg_b, Path("c", "c.jpg")} == set(paths)

# Do not follow symbolic links
paths = list(find_relative_paths(dir_a, [".jpg"], follow_symlinks=False))
Expand Down Expand Up @@ -253,7 +247,7 @@ def test_find_relative_paths_group_by_dir(tmp_path):

# first yielded item (or dictionary key) should be dir as path
included_dirs = list(dir_paths.keys())
assert all(isinstance(dirpath, pathlib.Path) for dirpath in included_dirs)
assert all(isinstance(dirpath, Path) for dirpath in included_dirs)
# should include relative path versions of directories with text files
relative_vol1 = vol1_dir.relative_to(ocr_dir)
assert relative_vol1 in included_dirs
Expand All @@ -264,7 +258,7 @@ def test_find_relative_paths_group_by_dir(tmp_path):

# for each volume dir, yielded items should be a list of relative paths
assert isinstance(dir_paths[relative_vol1], list)
assert all(isinstance(file, pathlib.Path) for file in dir_paths[relative_vol1])
assert all(isinstance(file, Path) for file in dir_paths[relative_vol1])
# we expect four files in both groups
assert len(dir_paths[relative_vol1]) == 4
assert len(dir_paths[relative_vol2]) == 4
Expand Down

0 comments on commit 430eba1

Please sign in to comment.