
Commit 5b394a3

Merge pull request #107 from Princeton-CDH/feature/ocrtext-to-json

Script to collate OCR text files by directory into one JSON file per directory (for Gale local OCR).

2 parents 4f9d0ca + 0913b6d, commit 5b394a3
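To illustrate the collation (a hypothetical layout; the directory and file names are invented, following the Gale naming convention shown in the code below):

```
ocr_txt/Gale/CW0112029406/            ocr_json/Gale/
|-- CW0112029406_00010.txt     ==>    |-- CW0112029406.json
|-- CW0112029406_00020.txt
|-- ...
```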

File tree

8 files changed: +499, -131 lines

.github/workflows/unit_tests.yml

Lines changed: 15 additions & 2 deletions

```diff
@@ -4,17 +4,29 @@ on:
   push: # run on every push or PR to any branch
   pull_request:
 
+env:
+  # python version used to calculate and submit code coverage
+  COV_PYTHON_VERSION: "3.12"
+
 jobs:
   python-unit:
     name: Python unit tests
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: ["3.11", "3.12"]
+    defaults:
+      run:
+        working-directory: .
 
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Setup Python
+      - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
 
       # base the python cache on the hash of all pyproject.toml,
       # which includes python requirements.
@@ -38,6 +50,7 @@ jobs:
         uses: codecov/codecov-action@v4
         with:
           token: ${{ secrets.CODECOV_TOKEN }} # required
+        # if: ${{ matrix.python == env.COV_PYTHON_VERSION }}
 
       # Set the color of the slack message used in the next step based on the
       # status of the build: "danger" for failure, "good" for success,
@@ -58,7 +71,7 @@ jobs:
         env:
           SLACK_COLOR: ${{ env.SLACK_COLOR }}
           SLACK_WEBHOOK: ${{ secrets.ACTIONS_SLACK_WEBHOOK }}
-          SLACK_TITLE: "Workflow `${{ github.workflow }}`: ${{ job.status }}"
+          SLACK_TITLE: "Workflow `${{ github.workflow }}` (python ${{ matrix.python }}): ${{ job.status }}"
           SLACK_MESSAGE: "Run <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|#${{ github.run_number }}> on <https://github.com/${{ github.repository }}/|${{ github.repository }}@${{ github.ref }}>"
           SLACK_FOOTER: "<https://github.com/${{ github.repository }}/commit/${{ github.sha }}|View commit>"
           MSG_MINIMAL: true # use compact slack message format
```

pyproject.toml

Lines changed: 3 additions & 1 deletion

```diff
@@ -5,14 +5,15 @@ build-backend = "hatchling.build"
 [project]
 name = "corppa"
 description = "Utilities for working with Princeton Prosody Archive full-text corpus"
-requires-python = ">=3.12"
+requires-python = ">=3.11"
 readme = "README.md"
 # license TBD
 #license.file = "LICENSE"
 #license = {text = "Apache-2"}
 classifiers = [
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.11",
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Science/Research",
     "Operating System :: OS Independent",
@@ -38,6 +39,7 @@ dev = ["pre-commit", "corppa[test]", "corppa[ocr]"]
 [project.scripts]
 corppa-filter = "corppa.utils.filter:main"
 corppa-ocr = "corppa.ocr.gvision_ocr:main"
+collate-txt = "corppa.ocr.collate_txt:main"
 
 [tool.hatch.version]
 path = "src/corppa/__init__.py"
```

src/corppa/ocr/collate_txt.py

Lines changed: 123 additions & 0 deletions (new file)

```python
#!/usr/bin/env python
"""
Script to turn directories with multiple text files into a single JSON
file containing text contents of all files with page numbers based
on text filenames. (Page number logic is currently Gale-specific.)
"""

import argparse
import json
import pathlib
import sys

from tqdm import tqdm

from corppa.utils.path_utils import find_relative_paths, get_page_number


def collate_txt(
    input_dir: pathlib.Path, output_dir: pathlib.Path, show_progress: bool = True
):
    """Takes a directory that contains text files grouped by directory at any
    level of nesting under the specified `input_dir` and combines them into
    one JSON file per directory. JSON files are created in the specified
    `output_dir` using the same hierarchy found in the `input_dir`.
    """
    directories = 0
    txt_files = 0
    skipped = 0

    # stack tqdm bars so we can briefly show status
    status = tqdm(
        desc="Collating",
        bar_format="{desc}{postfix}",
        disable=not show_progress,
    )

    for ocr_dir, files in tqdm(
        find_relative_paths(input_dir, [".txt"], group_by_dir=True),
        desc="Directories with text files",
        disable=not show_progress,
    ):
        # output will be a json file based on name of the directory containing
        # text files, with parallel directory structure to the source
        output_file = output_dir / ocr_dir.parent / f"{ocr_dir.name}.json"
        # if output exists from a previous run, skip
        if output_file.exists():
            skipped += 1
            continue

        directories += 1
        txt_files += len(files)
        status.set_postfix_str(f" {ocr_dir.stem}: {len(files)} txt files")

        # combine text contents into a dictionary keyed on page number
        txt_data = {}
        for filename in files:
            with (input_dir / filename).open(encoding="utf-8") as txtfile:
                txt_data[get_page_number(filename)] = txtfile.read()

        # ensure the parent directory exists
        output_file.parent.mkdir(exist_ok=True)
        # save out text content as json
        with output_file.open("w", encoding="utf-8") as outfile:
            json.dump(txt_data, outfile)

    status.set_postfix_str("")
    status.close()

    # report a summary of what was done
    print(
        f"\nCreated JSON file{'' if directories == 1 else 's'} for "
        + f"{directories:,} director{'y' if directories == 1 else 'ies'} "
        + f"with {txt_files:,} total text files; skipped {skipped:,}."
    )


def main():
    parser = argparse.ArgumentParser(
        description="Create JSON files to group OCR text files by directory."
    )
    # Required arguments
    parser.add_argument(
        "input_dir",
        help="Top-level input directory with directories of OCR text files.",
        type=pathlib.Path,
    )
    parser.add_argument(
        "output_dir",
        help="Top-level output directory for OCR consolidated into JSON files.",
        type=pathlib.Path,
    )
    # Optional arguments
    parser.add_argument(
        "--progress",
        help="Show progress",
        action=argparse.BooleanOptionalAction,
        default=True,
    )

    args = parser.parse_args()
    # Validate arguments
    if not args.input_dir.is_dir():
        print(
            f"Error: input directory {args.input_dir} does not exist", file=sys.stderr
        )
        sys.exit(1)
    # create output dir if it doesn't exist
    if not args.output_dir.is_dir():
        try:
            args.output_dir.mkdir()
            print(f"Creating output directory {args.output_dir}")
        except (FileExistsError, FileNotFoundError) as err:
            print(
                f"Error creating output directory {args.output_dir}: {err}",
                file=sys.stderr,
            )
            sys.exit(1)

    collate_txt(args.input_dir, args.output_dir, show_progress=args.progress)


if __name__ == "__main__":
    main()
```
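As a quick illustration, `collate_txt` can also be exercised directly from Python (a minimal sketch; the `ocr_txt`/`ocr_json` paths and page contents are invented for illustration):

```python
import json
import pathlib

from corppa.ocr.collate_txt import collate_txt

# Hypothetical Gale-style layout: one directory per volume,
# one text file per page (filenames like CW0112029406_00010.txt).
input_dir = pathlib.Path("ocr_txt/Gale")
output_dir = pathlib.Path("ocr_json/Gale")
# collate_txt only creates per-volume parent dirs, so make the top level first
output_dir.mkdir(parents=True, exist_ok=True)

collate_txt(input_dir, output_dir, show_progress=False)

# Each volume directory becomes one JSON file keyed on page number,
# e.g. {"0001": "...page text...", "0002": "..."}
for json_file in output_dir.glob("**/*.json"):
    pages = json.loads(json_file.read_text(encoding="utf-8"))
    print(json_file.name, len(pages), "pages")
```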

src/corppa/ocr/gvision_ocr.py

Lines changed: 2 additions & 29 deletions

```diff
@@ -12,7 +12,7 @@
 
 from tqdm import tqdm
 
-from corppa.utils.path_utils import get_ppa_source, get_vol_dir
+from corppa.utils.path_utils import find_relative_paths, get_ppa_source, get_vol_dir
 
 # Attempt to import Google Cloud Vision Python Client
 try:
@@ -24,33 +24,6 @@
     os.environ["GRPC_VERBOSITY"] = "NONE"
 
 
-def image_relpath_generator(image_dir, exts, follow_symlinks=True):
-    """
-    This generator method finds all images in image_dir with file extensions
-    in exts (case insensitive). For each of these images, the method yields
-    the relative path with respect to image_dir.
-
-    For example, if image_dir = "a/b/c/images" and there are image files at the
-    following paths: "a/b/c/images/alpha.jpg", "a/b/c/images/d/beta.jpg"
-    The generator will produce these two items: "alpha.jpg" and "d/beta.jpg"
-    """
-    # Create lowercase extension set from passed in exts
-    ext_set = {ext.lower() for ext in exts}
-
-    # Using pathlib.walk over glob because (1) it allows us to find files with
-    # multiple extensions in a single walk of the directory and (2) lets us
-    # leverage additional functionality of pathlib.
-    for dirpath, dirs, files in image_dir.walk(follow_symlinks=follow_symlinks):
-        # Check the files in walked directory
-        for file in files:
-            ext = os.path.splitext(file)[1]
-            if ext.lower() in ext_set:
-                filepath = dirpath.joinpath(file)
-                yield filepath.relative_to(image_dir)
-        # For future walking, remove hidden directories
-        dirs[:] = [d for d in dirs if d[0] != "."]
-
-
 def ocr_image_via_gvision(gvision_client, input_image, out_txt, out_json):
     """
     Perform OCR for input image using the Google Cloud Vision API via the provided client.
@@ -126,7 +99,7 @@ def ocr_images(in_dir, out_dir, exts, ocr_limit=0, show_progress=True):
 
     ocr_count = 0
     skip_count = 0
-    for image_relpath in image_relpath_generator(in_dir, exts):
+    for image_relpath in find_relative_paths(in_dir, exts):
         # Refresh progress bar
         if show_progress:
             progress_bar.refresh()
```

src/corppa/utils/path_utils.py

Lines changed: 87 additions & 1 deletion

````diff
@@ -1,8 +1,10 @@
 """
-Library of general-purpose auxiliary methods for stand-alone scripts
+General-purpose methods for working with paths, PPA identifiers, and directories
 """
 
+import os
 import pathlib
+from typing import Iterator
 
 _htid_encode_map = {":": "+", "/": "=", ".": ","}
 _htid_encode_table = str.maketrans(_htid_encode_map)
@@ -114,3 +116,87 @@ def get_image_relpath(work_id, page_num):
         raise NotImplementedError
     else:
         raise ValueError(f"Unsupported source '{source}'")
+
+
+def get_page_number(pagefile: pathlib.Path) -> str:
+    """Extract and return the page number from the filename for page-level
+    content (e.g., image or text). Returns the page number as a string
+    with leading zeros. (Note: logic is currently
+    specific to Gale/ECCO file naming conventions.)"""
+    # NOTE: this logic is currently specific to Gale/ECCO files,
+    # which look like CW0112029406_00180.txt
+
+    # split the file base/stem name by _ and take the last part
+    source_id, pagenum = pagefile.stem.split("_", 1)
+    if get_ppa_source(source_id) != "Gale":
+        raise NotImplementedError
+    # return the number as a string; strip extra trailing zero
+    return pagenum[:-1]  # strip trailing zero
+
+
+def find_relative_paths(
+    base_dir, exts, follow_symlinks=True, group_by_dir=False
+) -> Iterator[pathlib.Path] | Iterator[tuple[pathlib.Path, list]]:
+    """
+    This method finds files anywhere under the specified base directory
+    that match any of the specified file extensions (case insensitive),
+    and returns a generator of path objects with a path relative to the
+    base directory. File extensions should include the leading period,
+    i.e. `[".jpg", ".tiff"]` rather than `["jpg", "tiff"]`.
+
+    For example, given a base directory `a/b/c/images`, an extension list
+    of `.jpg`, and files nested at different levels in the hierarchy
+    `a/b/c/images/alpha.jpg`, `a/b/c/images/d/beta.jpg`:
+    ```
+    a/b/c/images
+    |-- alpha.jpg
+    +-- d
+        |-- beta.jpg
+    ```
+    The result will include the two items: `alpha.jpg` and `d/beta.jpg`.
+
+    When `group_by_dir` is `True`, resulting files will be returned grouped
+    by the parent directory. The return result is a tuple of a single
+    :class:`pathlib.Path` object for the directory and a list of
+    :class:`pathlib.Path` objects for the files in that directory that
+    match the specified extensions. Given a hierarchy like this:
+    ```
+    images/vol-a/
+    |-- alpha.jpg
+    |-- beta.jpg
+    ```
+    the method would return `(vol-a, [alpha.jpg, beta.jpg])`.
+    """
+    # Create lowercase extension set from passed in exts
+    ext_set = {ext.lower() for ext in exts}
+
+    # Using pathlib.Path.walk / os.walk over glob because (1) it allows us to
+    # find files with multiple extensions in a single walk of the directory
+    # and (2) lets us leverage additional functionality of pathlib.
+    if hasattr(base_dir, "walk"):
+        # As of Python 3.12, Path.walk exists
+        walk_generator = base_dir.walk(follow_symlinks=follow_symlinks)
+    else:
+        # For Python 3.11, fall back to os.walk
+        walk_generator = os.walk(base_dir, followlinks=follow_symlinks)
+    for dirpath, dirnames, filenames in walk_generator:
+        if isinstance(dirpath, str):
+            # Convert str produced by os.walk to Path object
+            dirpath = pathlib.Path(dirpath)
+        # Create a generator of relevant files in the current directory
+        include_files = (
+            dirpath.joinpath(file).relative_to(base_dir)
+            for file in filenames
+            if os.path.splitext(file)[1].lower() in ext_set
+        )
+        # if group by dir is specified, yield dirpath and list of files,
+        # but only if at least one relevant file is found
+        if group_by_dir:
+            include_files = list(include_files)
+            if include_files:
+                yield (dirpath.relative_to(base_dir), include_files)
+        else:
+            # otherwise yield just the files
+            yield from include_files
+
+        # modify dirnames in place to skip hidden directories
+        dirnames[:] = [d for d in dirnames if not d.startswith(".")]
````
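A short sketch of how the two new helpers behave (the `ocr_txt` directory and its contents are hypothetical, following the Gale file-naming convention documented in the code above):

```python
import pathlib

from corppa.utils.path_utils import find_relative_paths, get_page_number

# Gale/ECCO page files look like CW0112029406_00180.txt;
# the extra trailing zero of the page segment is stripped.
print(get_page_number(pathlib.Path("CW0112029406_00180.txt")))  # -> "0018"

# Flat iteration: yields paths relative to the base directory
for relpath in find_relative_paths(pathlib.Path("ocr_txt"), [".txt"]):
    print(relpath)  # e.g. CW0112029406/CW0112029406_00180.txt

# Grouped iteration: yields (directory, [files]) tuples, one per
# directory that contains at least one matching file
for ocr_dir, files in find_relative_paths(
    pathlib.Path("ocr_txt"), [".txt"], group_by_dir=True
):
    print(ocr_dir, len(files))
```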
