|
1 | 1 | """ |
2 | | -Library of general-purpose auxiliary methods for stand-alone scripts |
| 2 | +General-purpose methods for working with paths, PPA identifiers, and directories |
3 | 3 | """ |
4 | 4 |
|
| 5 | +import os |
5 | 6 | import pathlib |
| 7 | +from typing import Iterator |
6 | 8 |
|
7 | 9 | _htid_encode_map = {":": "+", "/": "=", ".": ","} |
8 | 10 | _htid_encode_table = str.maketrans(_htid_encode_map) |
@@ -114,3 +116,87 @@ def get_image_relpath(work_id, page_num): |
114 | 116 | raise NotImplementedError |
115 | 117 | else: |
116 | 118 | raise ValueError(f"Unsupported source '{source}'") |
| 119 | + |
| 120 | + |
def get_page_number(pagefile: pathlib.Path) -> str:
    """Return the page number embedded in a page-level filename.

    The file stem (name without extension) is split once at its first
    underscore into a source identifier and a page portion, e.g.
    ``CW0112029406_00180.txt`` gives ``CW0112029406`` and ``00180``.
    The page portion is returned as a string, leading zeros kept, with
    its final character removed (``"0018"`` for the example above).

    Only Gale/ECCO naming is handled at present.

    :param pagefile: path to a page-level file (e.g., image or text)
    :returns: page number as a zero-padded string
    :raises NotImplementedError: if the source identifier does not
        resolve to the ``"Gale"`` source
    :raises ValueError: if the stem contains no underscore, since the
        two-way unpacking below then fails
    """
    # one split at the first underscore: <source id>_<page portion>
    stem_parts = pagefile.stem.split("_", 1)
    source_id, page_digits = stem_parts

    if get_ppa_source(source_id) == "Gale":
        # Gale page numbers carry one extra trailing digit; drop it
        return page_digits[:-1]

    # no other source's page-file naming is supported yet
    raise NotImplementedError
| 135 | + |
| 136 | + |
| 137 | +def find_relative_paths( |
| 138 | + base_dir, exts, follow_symlinks=True, group_by_dir=False |
| 139 | +) -> Iterator[pathlib.Path] | Iterator[tuple[pathlib.Path, list]]: |
| 140 | + """ |
| 141 | + This method finds files anywhere under the specified base directory |
| 142 | + that match any of the specified file extensions (case insensitive), |
| 143 | + and returns a generator of path objects with a path relative to the |
| 144 | + base directory. File extensions should include the leading period, |
| 145 | + i.e. `[".jpg", ".tiff"]` rather than `["jpg", "tiff"]`. |
| 146 | +
|
| 147 | + For example, given a base directory `a/b/c/images`, an extension list of `.jpg`, |
| 148 | + and files nested at different levels in the hierarchy |
| 149 | + `a/b/c/images/alpha.jpg`, `a/b/c/images/d/beta.jpg`: |
| 150 | + ``` |
| 151 | + a/b/c/images |
| 152 | + |-- alpha.jpg |
| 153 | + +-- d |
| 154 | + |-- beta.jpg |
| 155 | + ``` |
| 156 | + The result will include the two items: `alpha.jpg and `d/beta.jpg` |
| 157 | +
|
| 158 | + When `group_by_dir` is `True`, resulting files will be returned grouped |
| 159 | + by the parent directory. The return result is a tuple of a single :class:`pathlib.Path` |
| 160 | + object for the directory and a list of :class:`pathlib.Path` objects for the files in that |
| 161 | + directory that match the specified extensions. Given a hierarchy like this: |
| 162 | + ``` |
| 163 | + images/vol-a/ |
| 164 | + |-- alpha.jpg |
| 165 | + |-- beta.jpg |
| 166 | + ``` |
| 167 | + the method would return `(vol-a, [alpha.jpg, beta.jpg])`. |
| 168 | + """ |
| 169 | + # Create lowercase extension set from passed in exts |
| 170 | + ext_set = {ext.lower() for ext in exts} |
| 171 | + |
| 172 | + # Using pathlib.Path.walk / os.walk over glob because (1) it allows us to |
| 173 | + # find files with multiple extensions in a single walk of the directory |
| 174 | + # and (2) lets us leverage additional functionality of pathlib. |
| 175 | + if hasattr(base_dir, "walk"): |
| 176 | + # As of Python 3.12, Path.walk exists |
| 177 | + walk_generator = base_dir.walk(follow_symlinks=follow_symlinks) |
| 178 | + else: |
| 179 | + # For Python 3.11, fall back to os.walk |
| 180 | + walk_generator = os.walk(base_dir, followlinks=follow_symlinks) |
| 181 | + for dirpath, dirnames, filenames in walk_generator: |
| 182 | + if isinstance(dirpath, str): |
| 183 | + # Convert str produced by os.walk to Path object |
| 184 | + dirpath = pathlib.Path(dirpath) |
| 185 | + # Create a generator of relevant files in the current directory |
| 186 | + include_files = ( |
| 187 | + dirpath.joinpath(file).relative_to(base_dir) |
| 188 | + for file in filenames |
| 189 | + if os.path.splitext(file)[1].lower() in ext_set |
| 190 | + ) |
| 191 | + # if group by dir is specified, yield dirpath and list of files, |
| 192 | + # but only if at least one relevant file is found |
| 193 | + if group_by_dir: |
| 194 | + include_files = list(include_files) |
| 195 | + if include_files: |
| 196 | + yield (dirpath.relative_to(base_dir), include_files) |
| 197 | + else: |
| 198 | + # otherwise yield just the files |
| 199 | + yield from include_files |
| 200 | + |
| 201 | + # modify dirnames in place to skip hidden directories |
| 202 | + dirnames[:] = [d for d in dirnames if not d.startswith(".")] |
0 commit comments