diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0dc3c2e..51e8996 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -32,7 +32,12 @@ jobs:
- name: Run tests
run: >-
pytest --cov=src/ --cov-report term-missing --doctest-modules tests
+ - name: Test cli command
+ run: >-
+ cat tests/data/PPN720183197-PHYS_0004.xml | alto-tools -t -
+ | grep -i 'stille gedanken'
+ shell: bash
- name: Build package
run: python3 -m pip install --upgrade build && python3 -m build
-...
+...
\ No newline at end of file
diff --git a/src/alto_tools/alto_tools.py b/src/alto_tools/alto_tools.py
index 4022f82..bbc0f3f 100644
--- a/src/alto_tools/alto_tools.py
+++ b/src/alto_tools/alto_tools.py
@@ -2,12 +2,14 @@
""" ALTO Tools: simple tools for performing various operations on ALTO xml files """
+from __future__ import annotations
import argparse
import codecs
import io
import os
import re
import sys
+from typing import Callable, Iterable
import xml.etree.ElementTree as ET
__version__ = "0.1.0"
@@ -253,7 +255,10 @@ def parse_arguments():
return args
-def walker(inputs, fnfilter=lambda fn: True):
+def walker(
+ inputs: Iterable[str],
+ fnfilter: Callable[[str], bool] = lambda fn: True,
+) -> Iterable[str]:
"""
Returns all file names in inputs, and recursively for directories.
@@ -266,12 +271,64 @@ def walker(inputs, fnfilter=lambda fn: True):
yield i
else:
for root, _, files in os.walk(i):
- for f in files:
- if fnfilter(f):
- yield os.path.join(root, f)
+ for f in filter(fnfilter, files):
+ yield os.path.join(root, f)
-def main():
+def open_input_file(
+    filename: str,
+    args: argparse.Namespace,
+) -> tuple[io.TextIOWrapper | str, ET.ElementTree, dict[str, str] | str] | None:
+    """
+    Parse a single ALTO file according to the encoding options in args.
+
+    If args.xml_encoding is set, the file is passed to alto_parse by name
+    together with an XMLParser for that encoding; the value "auto" takes the
+    encoding from the XML declaration found in the first 45 bytes of the
+    file. Otherwise the file is opened as text with args.file_encoding and
+    the open handle is passed to alto_parse.
+
+    Returns the (alto, xml, xmlns) triple produced by alto_parse, or None if
+    an IndexError is raised while parsing. An ET.ParseError is reported on
+    stderr and re-raised.
+    """
+    try:
+        if args.xml_encoding:
+            xml_encoding = args.xml_encoding
+            if xml_encoding == "auto":
+                with open(filename, "rb") as f:
+                    # errors="replace": the header may not be valid UTF-8.
+                    header = f.read(45).decode("utf-8", errors="replace")
+                m = re.search('encoding="(.*?)"', header)
+                # No encoding declaration: leave the choice to the parser
+                # instead of failing on m.group().
+                xml_encoding = m.group(1) if m else None
+            xmlp = ET.XMLParser(encoding=xml_encoding)
+            alto, xml, xmlns = alto_parse(filename, parser=xmlp)
+        else:
+            with open(filename, "r", encoding=args.file_encoding) as handle:
+                alto, xml, xmlns = alto_parse(handle)
+    except IndexError:
+        return None
+    except ET.ParseError:
+        print("Error parsing %s" % filename, file=sys.stderr)
+        raise
+    return alto, xml, xmlns
+
+
+def _read_from_stdin() -> (
+    Iterable[tuple[io.TextIOWrapper | str, ET.ElementTree, dict[str, str] | str]]
+):
+    """
+    Yield the result of parsing standard input, if there is piped input.
+
+    Yields nothing when standard input is missing, is an interactive
+    terminal, or when alto_parse returns a falsy result.
+    """
+    # Ask the current sys.stdin object rather than file descriptor 0, so a
+    # replaced sys.stdin (e.g. in tests) is honoured.
+    if sys.stdin is None or sys.stdin.isatty():
+        return
+    # Narrows the type for static checkers.
+    assert isinstance(sys.stdin, io.TextIOWrapper)
+    parsing_result = alto_parse(sys.stdin)
+    if not parsing_result:
+        return
+    yield parsing_result
+
+
+def open_input_files(
+    args: argparse.Namespace,
+) -> Iterable[tuple[io.TextIOWrapper | str, ET.ElementTree, dict[str, str] | str]]:
+    """
+    Yield an (alto, xml, xmlns) triple for each parsed input in args.INPUT.
+
+    If "-" is among the inputs, standard input is read first. args.INPUT is
+    then passed to walker() together with a filter accepting names that end
+    in ".xml" or ".alto". Files for which open_input_file returns a falsy
+    result are skipped.
+    """
+    if "-" in args.INPUT:
+        yield from _read_from_stdin()
+
+    def fnfilter(fn: str) -> bool:
+        return fn.endswith((".xml", ".alto"))
+
+    for filename in walker(args.INPUT, fnfilter):
+        parsing_result = open_input_file(filename, args)
+        if not parsing_result:
+            continue
+        alto, xml, xmlns = parsing_result
+        try:
+            yield (alto, xml, xmlns)
+        finally:
+            # Also runs when the consumer stops iterating early or raises.
+            if not isinstance(alto, str):
+                alto.close()
+
+
+def main() -> None:
if sys.version_info < (3, 0):
sys.stdout.write("Python 3 is required.\n")
sys.exit(-1)
@@ -282,28 +339,10 @@ def main():
os.system("python alto_tools.py -h")
sys.exit(-1)
else:
- fnfilter = lambda fn: fn.endswith(".xml") or fn.endswith(".alto")
- confidence_sum = 0
- for filename in walker(args.INPUT, fnfilter):
- try:
- if args.xml_encoding:
- xml_encoding = args.xml_encoding
- if xml_encoding == "auto":
- with open(filename, "rb") as f:
- m = re.search(
- 'encoding="(.*?)"', f.read(45).decode("utf-8")
- )
- xml_encoding = m.group(1)
- xmlp = ET.XMLParser(encoding=xml_encoding)
- alto, xml, xmlns = alto_parse(filename, parser=xmlp)
- else:
- with open(filename, "r", encoding=args.file_encoding) as alto:
- alto, xml, xmlns = alto_parse(alto)
- except IndexError:
- continue
- except ET.ParseError as e:
- print("Error parsing %s" % filename, file=sys.stderr)
- raise e
+ confidence_sum = 0.
+ number_of_files = 0
+ for alto, xml, xmlns in open_input_files(args):
+ number_of_files += 1
if args.confidence:
confidence_sum += alto_confidence(alto, xml, xmlns)
if args.text:
@@ -314,12 +353,10 @@ def main():
alto_graphics(alto, xml, xmlns)
if args.statistics:
alto_statistics(alto, xml, xmlns)
- number_of_files = len(list(walker(args.INPUT, fnfilter)))
- if number_of_files >= 2:
- if args.confidence:
- print(
- f"\n\nConfidence of folder: {round(confidence_sum / number_of_files, 2)}"
- )
+ if number_of_files >= 2 and args.confidence:
+ print(
+ f"\n\nConfidence of folder: {round(confidence_sum / number_of_files, 2)}"
+ )
if __name__ == "__main__":
diff --git a/tests/data/PPN750717092-00000780.ocr.xml b/tests/data/PPN750717092-00000780.ocr.xml
new file mode 100644
index 0000000..a8dd847
--- /dev/null
+++ b/tests/data/PPN750717092-00000780.ocr.xml
@@ -0,0 +1,859 @@
+
+
+
+ pixel
+
+
+ 2016-08-18
+
+ ABBYY
+ ABBYY FineReader Engine
+ 11
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_alto_tools.py b/tests/test_alto_tools.py
index 4f6c678..cfdace0 100644
--- a/tests/test_alto_tools.py
+++ b/tests/test_alto_tools.py
@@ -1,15 +1,11 @@
-# Setup sys.path. A bit ugly but avoids setting up setup.py for now.
-from pathlib import Path
-import sys
-
-sys.path.append(str(Path(__file__).resolve().parents[1]))
-
import collections
import os
import re
import tempfile
+from pathlib import Path
+
+from alto_tools import alto_tools
-from src.alto_tools import alto_tools
datadir = os.path.join(str(Path(__file__).resolve().parent), "data")
@@ -30,10 +26,12 @@ def test_alto_text(capsys):
assert re.search(r"Stille Gedanken", captured.out)
-def test_walker():
- def create_empty_file(fn):
- open(fn, "a").close()
+def create_empty_file(fn: str) -> None:
+    """Create the file fn if it is missing; nothing is written to it."""
+    with open(fn, "a"):
+        pass
+
+def test_walker():
with tempfile.TemporaryDirectory() as tmpdirname:
# Create some test files
create_empty_file(os.path.join(tmpdirname, "test1.xml"))
@@ -51,7 +49,7 @@ def create_empty_file(fn):
expected = [
os.path.join(tmpdirname, "test1.xml"),
os.path.join(tmpdirname, "test1.xml"), # second instance from tmpdirname
- os.path.join(tmpdirname, "test2.xml")
+ os.path.join(tmpdirname, "test2.xml"),
# NOT 'this-should-not-be-returned'
]
assert collections.Counter(
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 0d1f126..9268677 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -7,6 +7,8 @@
from alto_tools import alto_tools
+from test_alto_tools import create_empty_file, datadir
+
def argv(args: str) -> List[str]:
"""
@@ -45,3 +47,66 @@ def test_single_file_file_encoding(
sys.argv = argv(f"-t -e iso8859-1 {fn}")
alto_tools.main()
assert "Stille Gedanken" in capsys.readouterr().out
+
+
+def test_nonexistant_file_input(capsys: pytest.CaptureFixture[str]) -> None:
+    """A path that does not exist produces no output on stdout."""
+    sys.argv = argv("i/dont/exist.xml -t")
+    alto_tools.main()
+    captured = capsys.readouterr()
+    assert captured.out == ""
+
+
+def test_invalid_input_file(capsys: pytest.CaptureFixture[str]) -> None:
+    """
+    An empty .xml file makes main() raise UnboundLocalError.
+
+    NOTE(review): the exception type looks like a leaked internal error
+    rather than a designed contract - confirm before relying on it.
+    """
+    with tempfile.TemporaryDirectory() as workdir:
+        empty_path = os.path.join(workdir, "empty.xml")
+        create_empty_file(empty_path)
+        sys.argv = argv(f"{workdir}/empty.xml -t")
+        with pytest.raises(UnboundLocalError):
+            alto_tools.main()
+
+
+def test_single_file_confidence(capsys: pytest.CaptureFixture[str]) -> None:
+    """The -c option prints the confidence value for a single file."""
+    sys.argv = argv("tests/data/PPN750717092-00000780.ocr.xml -c")
+    alto_tools.main()
+    stdout = capsys.readouterr().out
+    assert "Confidence: 78.29" in stdout
+
+
+def test_multi_files_confidence(capsys: pytest.CaptureFixture[str]) -> None:
+    """With a directory as input, -c prints per-file and folder confidence."""
+    sys.argv = argv("-c tests/")
+    alto_tools.main()
+    captured = capsys.readouterr()
+    for expected in ("Confidence: 78.29", "Confidence of folder: "):
+        assert expected in captured.out
+
+
+def test_single_file_stats(capsys: pytest.CaptureFixture[str]) -> None:
+    """The -s option prints the element count for a single file."""
+    sys.argv = argv("-s tests/data/PPN750717092-00000780.ocr.xml")
+    alto_tools.main()
+    stdout = capsys.readouterr().out
+    assert "# of elements: 60" in stdout
+
+
+def test_single_file_text_extraction(capsys: pytest.CaptureFixture[str]) -> None:
+    """The -t option prints the text content of a single file."""
+    sys.argv = argv("-t tests/data/PPN750717092-00000780.ocr.xml")
+    alto_tools.main()
+    expected = "Thüren, Lieferung der erforderlichen Gerüste und"
+    stdout = capsys.readouterr().out
+    assert expected in stdout
+
+
+def test_single_file_illustration_coords(capsys: pytest.CaptureFixture[str]) -> None:
+    """The -i option prints illustration coordinates for a single file."""
+    sys.argv = argv("-i tests/data/PPN720183197-PHYS_0004.xml")
+    alto_tools.main()
+    stdout = capsys.readouterr().out
+    assert "Illustration: block_20=201,321,61,226" in stdout
+
+
+def test_single_file_graphic_coords(capsys: pytest.CaptureFixture[str]) -> None:
+    """The -g option prints graphical element coordinates for a single file."""
+    sys.argv = argv("-g tests/data/PPN750717092-00000780.ocr.xml")
+    alto_tools.main()
+    stdout = capsys.readouterr().out
+    assert "GraphicalElement: Page1_Block29=11,899,2983,549" in stdout
+
+
+def test_pipe_input_xml(
+    capsys: pytest.CaptureFixture[str],
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """With "-" as the input argument, the document is read from stdin."""
+    with open(os.path.join(datadir, "PPN720183197-PHYS_0004.xml")) as f:
+        # monkeypatch restores sys.stdin and sys.argv after the test, so
+        # later tests do not inherit a closed file as standard input.
+        monkeypatch.setattr(sys, "stdin", f)
+        monkeypatch.setattr(sys, "argv", ["alto-tools", "-t", "-"])
+        alto_tools.main()
+    captured = capsys.readouterr()
+    assert "Stille Gedanken" in captured.out