hocr-eval

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# compute statistics about the quality of the geometric segmentation
# at the level of the given OCR element

import argparse
import re

from lxml import html
from PIL import Image, ImageDraw

################################################################
# library
################################################################

# XML node processing


def get_prop(node, name):
    """Return the argument string of property ``name`` on ``node``.

    The node's ``title`` attribute is read as a ``;``-separated list of
    entries, each one a key followed by whitespace and its arguments.

    Returns the arguments of the first entry whose key equals ``name``
    (an empty string if that entry has a key but no arguments), or None
    when the node has no title or no matching entry.
    """
    title = node.get('title')
    if not title:
        return None
    for prop in title.split(';'):
        parts = prop.split(None, 1)
        # empty segments (e.g. after a trailing ';') yield no parts at all;
        # skip them instead of failing on the unpacking
        if not parts or parts[0] != name:
            continue
        return parts[1] if len(parts) == 2 else ''
    return None


def get_bbox(node):
    """Return the node's ``bbox`` property as a tuple of ints.

    Returns None when the property is missing or empty.
    """
    spec = get_prop(node, 'bbox')
    if spec:
        return tuple(int(field) for field in spec.split())
    return None


def get_text(node):
    """Return all text below ``node`` with whitespace runs collapsed.

    Every run of whitespace in the concatenated text becomes one space;
    leading and trailing spaces are not stripped.
    """
    pieces = node.xpath(".//text()")
    joined = "".join(pieces)
    return re.sub(r'\s+', ' ', joined)


# rectangle properties


def intersect(u, v):
    """Return the intersection of rectangles ``u`` and ``v``.

    Rectangles are indexed as (x0, y0, x1, y1).  When the inputs do not
    overlap the result is still returned, with x1 < x0 and/or y1 < y0.
    """
    x0 = max(u[0], v[0])
    y0 = max(u[1], v[1])
    x1 = min(u[2], v[2])
    y1 = min(u[3], v[3])
    return (x0, y0, x1, y1)


def width(u):
    """Return the width of rectangle ``u``, clamped to be non-negative."""
    span = u[2] - u[0]
    return max(0, span)


def height(u):
    """Return the height of rectangle ``u``, clamped to be non-negative."""
    span = u[3] - u[1]
    return max(0, span)


def area(u):
    """Return the area of rectangle ``u``.

    Each side is clamped to zero first, so an inverted rectangle (such as
    the intersection of two disjoint boxes) has area 0.
    """
    clipped_w = max(0, u[2] - u[0])
    clipped_h = max(0, u[3] - u[1])
    return clipped_w * clipped_h


def erode(u, tx, ty):
    """Return rectangle ``u`` shrunk on every side.

    The left and right edges move inwards by ``2 * tx + 1``, the top and
    bottom edges by ``2 * ty + 1``.
    """
    dx = 2 * tx + 1
    dy = 2 * ty + 1
    return (u[0] + dx, u[1] + dy, u[2] - dx, u[3] - dy)


# text comparison

# any run of characters other than ASCII letters, digits and . , ! ? : ;
simp_re = re.compile(r'[^a-zA-Z0-9.,!?:;]+')


def normalize(s):
    """Return ``s`` simplified for text comparison.

    Each run of characters not kept by ``simp_re`` becomes a single space,
    then leading and trailing whitespace is removed.
    """
    return simp_re.sub(' ', s).strip()


# edit distance


def edit_distance(a, b, threshold=99999):
    """Return the edit distance between sequences ``a`` and ``b``.

    Insertions, deletions and substitutions each cost 1.

    ``threshold`` allows an early exit: once every entry of a row of the
    distance table has reached ``threshold``, the final distance cannot be
    smaller, and the smallest entry of that row (a lower bound on the true
    distance, itself >= ``threshold``) is returned.  Distances below
    ``threshold`` are always returned exactly.
    """
    if a == b:
        return 0
    n = len(b)
    # previous[j] is the distance between the first i - 1 items of a and
    # the first j items of b; only two rows of the table are kept
    previous = list(range(n + 1))
    for i, item_a in enumerate(a, start=1):
        current = [i]
        for j, item_b in enumerate(b, start=1):
            substitution = 0 if item_a == item_b else 1
            current.append(min(previous[j] + 1, current[j - 1] + 1,
                               previous[j - 1] + substitution))
        # every alignment passes through this row and costs never decrease
        # along it, so the row minimum bounds the result from below; one
        # large cell alone says nothing about the final distance
        lower_bound = min(current)
        if lower_bound >= threshold:
            return lower_bound
        previous = current
    return previous[n]


# def remove_tex(text):
#     text_file = os.popen("echo %s | detex " %(text))
#     text_plain = text_file.read()
#     text_file.close()
#     return text_plain


def remove_tex(text):
    """Return ``text`` unchanged.

    Placeholder: the variant that piped the text through ``detex`` is
    disabled (kept as commented-out code next to this definition).
    """
    return text


################################################################
# main program
################################################################

# argument parsing

parser = argparse.ArgumentParser()
parser.add_argument(
    "truth", help="hOCR file with ground truth", type=argparse.FileType('r'))
parser.add_argument(
    "actual",
    help="hOCR file from the actual recognition",
    type=argparse.FileType('r'))
parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("-v", "--verbose", action="store_true")
# not yet supported:
# parser.add_argument("-e", "--element", default="ocr_line",
#                     help="%(default)s")
# parser.add_argument("-o", "--significant_overlap", type=float, default=0.1,
#                     help="default: %(default)s")
# The image is decoded by PIL, which reads bytes: it has to be opened in
# binary mode, a text-mode handle would hand PIL str objects instead.
parser.add_argument(
    "-i",
    "--imgfile",
    help="page image on which the boxes of segmentation errors are drawn",
    type=argparse.FileType('rb'))
args = parser.parse_args()

# optional page image: opened here, drawn on while lines are compared
if args.imgfile:
    im = Image.open(args.imgfile)
    print(im.size, im.format, im.mode)
    draw = ImageDraw.Draw(im)

# parse both hOCR documents
truth_doc = html.parse(args.truth)
actual_doc = html.parse(args.actual)

# collect the page elements of each document
truth_pages = truth_doc.xpath("//*[@class='ocr_page']")
actual_pages = actual_doc.xpath("//*[@class='ocr_page']")

# Pages are paired by position, so both documents need the same number of
# pages.  This is checked explicitly rather than with an assert: asserts
# are removed under "python -O", and zip() would then silently drop the
# surplus pages of the longer document.
if len(truth_pages) != len(actual_pages):
    raise SystemExit(
        "page count mismatch: truth has %d page(s), actual has %d page(s)" %
        (len(truth_pages), len(actual_pages)))
pages = zip(truth_pages, actual_pages)

# error counters, accumulated over all pages
segmentation_errors = segmentation_ocr_errors = ocr_errors = 0

# Tolerances in horizontal and vertical direction.  HTOL / VTOL are
# percentages: a line is eroded by (100 - TOL) percent of its width / height,
# but by no more than the absolute limit HPIX / VPIX.
HTOL, VTOL = 90, 80
HPIX = VPIX = 5

# NOTE(review): not referenced by the visible part of the script
used = {}

# Compare every ground-truth line with the recognized line that overlaps it
# most.  Lines whose boxes do not match tightly count as segmentation errors;
# for all other lines the edit distance of the texts counts as OCR errors.
for truth, actual in pages:
    # './/' restricts the search to the current page.  A leading '//' is an
    # absolute path evaluated from the document root, so every page would
    # see the lines of the whole document.
    true_lines = truth.xpath(".//*[@class='ocr_line']")
    actual_lines = actual.xpath(".//*[@class='ocr_line']")
    # box and text of a recognized line are the same for every truth line,
    # so they are extracted once per page
    actual_items = [(get_bbox(line), get_text(line)) for line in actual_lines]
    for true_line in true_lines:
        bbox = get_bbox(true_line)
        raw_text = get_text(true_line)
        true_text = remove_tex(raw_text)
        # tolerance: a share of the line size, capped at HPIX / VPIX
        tx = min(HPIX, (100 - HTOL) * width(bbox) / 100)
        ty = min(VPIX, (100 - VTOL) * height(bbox) / 100)
        bbox_small = erode(bbox, tx, ty)
        candidates = [(area(intersect(cand_bbox, bbox)), cand_bbox, cand_text)
                      for cand_bbox, cand_text in actual_items]
        q = 0
        tight_overlap = False
        if candidates:
            # best match: largest intersection area with the truth box
            q, actual_bbox, actual_line = max(candidates)
            actual_bbox_small = erode(actual_bbox, tx, ty)
            # tight: each eroded box lies completely inside the other box
            tight_overlap = (
                area(intersect(actual_bbox_small, bbox)) ==
                area(actual_bbox_small)
                and area(intersect(actual_bbox, bbox_small)) ==
                area(bbox_small))

        if not tight_overlap:
            if args.verbose:
                true_area = area(bbox)
                # a degenerate truth box has no area to relate the overlap to
                overlap = q * 1.0 / true_area if true_area else 0.0
                print("segmentation_error: area_overlap =", overlap,
                      "true_bbox", bbox)
                print("\t", raw_text)
            segmentation_errors += 1

            if candidates:
                segmentation_ocr_errors += edit_distance(
                    normalize(true_text), normalize(actual_line))
            else:
                # nothing was recognized on this page: the whole line is lost
                segmentation_ocr_errors += len(raw_text)

            if args.imgfile:
                # truth box in red, best recognized box in blue
                draw.rectangle(bbox, outline="#ff0000")
                if candidates:
                    draw.rectangle(actual_bbox, outline="#0000ff")
            continue
        actual_text = actual_line
        if args.debug:
            print("overlap", q, "true_bbox", bbox)
            print("\t", true_text)
            print("\t", actual_text)
        error = edit_distance(normalize(true_text), normalize(actual_text))
        if args.verbose and error > 0:
            print("ocr_error", error, "true_bbox", bbox)
            print("\t", true_text)
            print("\t", actual_text)
        ocr_errors += error

# final report: one "name value" line per counter
for label, count in (("segmentation_errors", segmentation_errors),
                     ("segmentation_ocr_errors", segmentation_ocr_errors),
                     ("ocr_errors", ocr_errors)):
    print(label, count)
# the annotated page image is written out and displayed
if args.imgfile:
    im.save("errors.png")
    im.show("errors.png")