Skip to content

Commit

Permalink
Add multiline Japanese strings support to HocrVisualParser() to fix #534
Browse files Browse the repository at this point in the history
 and redo #537
  • Loading branch information
YasushiMiyata authored and lukehsiao committed May 7, 2021
1 parent 5ab8e9c commit 54fcfd3
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 12 deletions.
22 changes: 10 additions & 12 deletions src/fonduer/parser/visual_parser/hocr_visual_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,21 +117,19 @@ def attrib_parse(
h2s_multi_idx = [
k for k, v in h2s_multi.items() if ptr + i == v
]
start, end = 0, 0
if h2s_multi_idx: # One hOCR word-to-multi spacy tokens
start = h2s_multi_idx[0]
end = h2s_multi_idx[-1] + 1
# calculate a bbox that can include all
left = min(lefts[start:end])
top = min(tops[start:end])
right = max(rights[start:end])
bottom = max(bottoms[start:end])
ppageno = ppagenos[start]
else:
raise RuntimeError(
"Tokens are not aligned!",
f"hocr tokens: {hocr_tokens}",
f"spacy tokens: {spacy_tokens}",
)
else: # One spaCy token-to-multi hOCR words
start = s2h_multi[i - 1 if i > 0 else 0]
end = s2h_multi[i + 1] + 1
# calculate a bbox that can include all
left = min(lefts[start:end])
top = min(tops[start:end])
right = max(rights[start:end])
bottom = max(bottoms[start:end])
ppageno = ppagenos[start]
# One-to-one mapping is available
else:
left = lefts[s2h[ptr + i]]
Expand Down
10 changes: 10 additions & 0 deletions tests/data/hocr_simple/japan.hocr
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 145 175 245 185">
<p class='ocr_par' id='par_1_2' lang='jpn' title="bbox 145 175 245 185">
<span class="ocr_line" id="line_1_2" title="bbox 145 175 225 180">
<span class="ocrx_word" id="word_1_60" title="bbox 145 175 225 180; x_wconf 92">チェーン店</span>
</span>
<span class="ocr_line" id="line_1_3" title="bbox 226 181 245 185">
<span class="ocrx_word" id="word_1_61" title="bbox 226 181 245 185; x_wconf 92">本・支店</span>
</span>
</p>
</div>
</div>
</body>
</html>
8 changes: 8 additions & 0 deletions tests/parser/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,14 @@ def test_parse_hocr():
assert sent.left[1] == 150 # this left comes from "に" in hOCR
assert sent.right[1] == 249 # this right comes from "ぽん" in hOCR

sent = doc.sentences[2]
assert len(sent.words) == len(sent.left)
# "チェーン店\n本・支店" is tokenized into two: "チェーン店", "本・支店" in hOCR,
# but it is tokenized as "チェーン", "店本", "・", "支店" by spaCy.
assert sent.words[1] == "店本"
assert sent.left[1] == 145 # comes from left min of "チェーン店\n本・支店" in hOCR
assert sent.right[1] == 245 # comes from right max of "チェーン店\n本・支店" in hOCR


def test_parse_hocr_with_tables():
"""Test the parser with hOCR documents that have tables."""
Expand Down

0 comments on commit 54fcfd3

Please sign in to comment.