diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0dc3c2e..51e8996 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -32,7 +32,12 @@ jobs: - name: Run tests run: >- pytest --cov=src/ --cov-report term-missing --doctest-modules tests + - name: Test cli command + run: >- + cat tests/data/PPN720183197-PHYS_0004.xml | alto-tools -t - + | grep -i 'stille gedanken' + shell: bash - name: Build package run: python3 -m pip install --upgrade build && python3 -m build -... +... \ No newline at end of file diff --git a/src/alto_tools/alto_tools.py b/src/alto_tools/alto_tools.py index 4022f82..bbc0f3f 100644 --- a/src/alto_tools/alto_tools.py +++ b/src/alto_tools/alto_tools.py @@ -2,12 +2,14 @@ """ ALTO Tools: simple tools for performing various operations on ALTO xml files """ +from __future__ import annotations import argparse import codecs import io import os import re import sys +from typing import Callable, Iterable import xml.etree.ElementTree as ET __version__ = "0.1.0" @@ -253,7 +255,10 @@ def parse_arguments(): return args -def walker(inputs, fnfilter=lambda fn: True): +def walker( + inputs: Iterable[str], + fnfilter: Callable[[str], bool] = lambda fn: True, +) -> Iterable[str]: """ Returns all file names in inputs, and recursively for directories. @@ -266,12 +271,64 @@ def walker(inputs, fnfilter=lambda fn: True): yield i else: for root, _, files in os.walk(i): - for f in files: - if fnfilter(f): - yield os.path.join(root, f) + for f in filter(fnfilter, files): + yield os.path.join(root, f) -def main(): +def open_input_file( + filename: str, + args: argparse.Namespace, +) -> tuple[io.TextIOWrapper | str, ET.ElementTree, dict[str, str] | str] | None: + try: + if args.xml_encoding: + xml_encoding = args.xml_encoding + if xml_encoding == "auto": + with open(filename, "rb") as f: + m = re.search('encoding="(.*?)"', f.read(45).decode("utf-8")) + xml_encoding = m.group(1) + xmlp = ET.XMLParser(encoding=xml_encoding) + alto, xml, xmlns = alto_parse(filename, parser=xmlp) + else: + with open(filename, "r", encoding=args.file_encoding) as alto: + alto, xml, xmlns = alto_parse(alto) + except IndexError: + return None + except ET.ParseError as e: + print("Error parsing %s" % filename, file=sys.stderr) + raise e + return alto, xml, xmlns + + +def _read_from_stdin() -> ( + Iterable[tuple[io.TextIOWrapper | str, ET.ElementTree, dict[str, str] | str]] +): + if os.isatty(0): + return + assert isinstance(sys.stdin, io.TextIOWrapper) + parsing_result = alto_parse(sys.stdin) + if not parsing_result: + return + yield parsing_result + + +def open_input_files( + args: argparse.Namespace, +) -> Iterable[tuple[io.TextIOWrapper | str, ET.ElementTree, dict[str, str] | str]]: + if "-" in args.INPUT: + yield from _read_from_stdin() + fnfilter = lambda fn: fn.endswith(".xml") or fn.endswith(".alto") + for filename in walker(args.INPUT, fnfilter): + parsing_result = open_input_file(filename, args) + if not parsing_result: + continue + alto, xml, xmlns = parsing_result + yield (alto, xml, xmlns) + if isinstance(alto, str): + continue + alto.close() + + +def main() -> None: if sys.version_info < (3, 0): sys.stdout.write("Python 3 is required.\n") sys.exit(-1) @@ -282,28 +339,10 @@ def main(): os.system("python alto_tools.py -h") sys.exit(-1) else: - fnfilter = lambda fn: fn.endswith(".xml") or fn.endswith(".alto") - confidence_sum = 0 - for filename in walker(args.INPUT, fnfilter): - try: - if args.xml_encoding: - xml_encoding = args.xml_encoding - if xml_encoding == "auto": - with open(filename, "rb") as f: - m = re.search( - 'encoding="(.*?)"', f.read(45).decode("utf-8") - ) - xml_encoding = m.group(1) - xmlp = ET.XMLParser(encoding=xml_encoding) - alto, xml, xmlns = alto_parse(filename, parser=xmlp) - else: - with open(filename, "r", encoding=args.file_encoding) as alto: - alto, xml, xmlns = alto_parse(alto) - except IndexError: - continue - except ET.ParseError as e: - print("Error parsing %s" % filename, file=sys.stderr) - raise e + confidence_sum = 0. + number_of_files = 0 + for alto, xml, xmlns in open_input_files(args): + number_of_files += 1 if args.confidence: confidence_sum += alto_confidence(alto, xml, xmlns) if args.text: @@ -314,12 +353,10 @@ def main(): alto_graphics(alto, xml, xmlns) if args.statistics: alto_statistics(alto, xml, xmlns) - number_of_files = len(list(walker(args.INPUT, fnfilter))) - if number_of_files >= 2: - if args.confidence: - print( - f"\n\nConfidence of folder: {round(confidence_sum / number_of_files, 2)}" - ) + if number_of_files >= 2 and args.confidence: + print( + f"\n\nConfidence of folder: {round(confidence_sum / number_of_files, 2)}" + ) if __name__ == "__main__": diff --git a/tests/data/PPN750717092-00000780.ocr.xml b/tests/data/PPN750717092-00000780.ocr.xml new file mode 100644 index 0000000..a8dd847 --- /dev/null +++ b/tests/data/PPN750717092-00000780.ocr.xml @@ -0,0 +1,859 @@ + + + + pixel + + + 2016-08-18 + + ABBYY + ABBYY FineReader Engine + 11 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_alto_tools.py b/tests/test_alto_tools.py index 4f6c678..cfdace0 100644 --- a/tests/test_alto_tools.py +++ b/tests/test_alto_tools.py @@ -1,15 +1,11 @@ -# Setup sys.path. A bit ugly but avoids setting up setup.py for now. -from pathlib import Path -import sys - -sys.path.append(str(Path(__file__).resolve().parents[1])) - import collections import os import re import tempfile +from pathlib import Path + +from alto_tools import alto_tools -from src.alto_tools import alto_tools datadir = os.path.join(str(Path(__file__).resolve().parent), "data") @@ -30,10 +26,12 @@ def test_alto_text(capsys): assert re.search(r"Stille Gedanken", captured.out) -def test_walker(): - def create_empty_file(fn): - open(fn, "a").close() +def create_empty_file(fn: str) -> None: + with open(fn, "a"): + ... + +def test_walker(): with tempfile.TemporaryDirectory() as tmpdirname: # Create some test files create_empty_file(os.path.join(tmpdirname, "test1.xml")) @@ -51,7 +49,7 @@ def create_empty_file(fn): expected = [ os.path.join(tmpdirname, "test1.xml"), os.path.join(tmpdirname, "test1.xml"), # second instance from tmpdirname - os.path.join(tmpdirname, "test2.xml") + os.path.join(tmpdirname, "test2.xml"), # NOT 'this-should-not-be-returned' ] assert collections.Counter( diff --git a/tests/test_cli.py b/tests/test_cli.py index 0d1f126..9268677 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,8 @@ from alto_tools import alto_tools +from test_alto_tools import create_empty_file, datadir + def argv(args: str) -> List[str]: """ @@ -45,3 +47,66 @@ def test_single_file_file_encoding( sys.argv = argv(f"-t -e iso8859-1 {fn}") alto_tools.main() assert "Stille Gedanken" in capsys.readouterr().out + + +def test_nonexistant_file_input(capsys: pytest.CaptureFixture[str]) -> None: + sys.argv = argv("i/dont/exist.xml -t") + alto_tools.main() + assert not capsys.readouterr().out + + +def test_invalid_input_file(capsys: pytest.CaptureFixture[str]) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + create_empty_file(os.path.join(tmpdir, "empty.xml")) + sys.argv = argv(f"{tmpdir}/empty.xml -t") + with pytest.raises(UnboundLocalError): + alto_tools.main() + + +def test_single_file_confidence(capsys: pytest.CaptureFixture[str]) -> None: + sys.argv = argv("tests/data/PPN750717092-00000780.ocr.xml -c") + alto_tools.main() + assert "Confidence: 78.29" in capsys.readouterr().out + + +def test_multi_files_confidence(capsys: pytest.CaptureFixture[str]) -> None: + sys.argv = argv("-c tests/") + alto_tools.main() + stdout = capsys.readouterr().out + assert "Confidence: 78.29" in stdout + assert "Confidence of folder: " in stdout + + +def test_single_file_stats(capsys: pytest.CaptureFixture[str]) -> None: + sys.argv = argv("-s tests/data/PPN750717092-00000780.ocr.xml") + alto_tools.main() + assert "# of elements: 60" in capsys.readouterr().out + + +def test_single_file_text_extraction(capsys: pytest.CaptureFixture[str]) -> None: + sys.argv = argv("-t tests/data/PPN750717092-00000780.ocr.xml") + alto_tools.main() + assert ( + "Thüren, Lieferung der erforderlichen Gerüste und" + ) in capsys.readouterr().out + + +def test_single_file_illustration_coords(capsys: pytest.CaptureFixture[str]) -> None: + sys.argv = argv("-i tests/data/PPN720183197-PHYS_0004.xml") + alto_tools.main() + assert "Illustration: block_20=201,321,61,226" in capsys.readouterr().out + + +def test_single_file_graphic_coords(capsys: pytest.CaptureFixture[str]) -> None: + sys.argv = argv("-g tests/data/PPN750717092-00000780.ocr.xml") + alto_tools.main() + assert "GraphicalElement: Page1_Block29=11,899,2983,549" in capsys.readouterr().out + + +def test_pipe_input_xml(capsys: pytest.CaptureFixture[str]) -> None: + with open(os.path.join(datadir, "PPN720183197-PHYS_0004.xml")) as f: + sys.stdin = f + sys.argv = ["alto-tools", "-t", "-"] + alto_tools.main() + captured = capsys.readouterr() + assert "Stille Gedanken" in captured.out