Add multiline Japanese strings support to HocrVisualParser() to fix #534

and redo #537
HazyResearch · May 7, 2021 · 54fcfd3 · 54fcfd3
1 parent 5ab8e9c
commit 54fcfd3
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 12 deletions.
diff --git a/src/fonduer/parser/visual_parser/hocr_visual_parser.py b/src/fonduer/parser/visual_parser/hocr_visual_parser.py
@@ -117,21 +117,19 @@ def attrib_parse(
                             h2s_multi_idx = [
                                 k for k, v in h2s_multi.items() if ptr + i == v
                             ]
+                            start, end = 0, 0
                             if h2s_multi_idx:  # One hOCR word-to-multi spacy tokens
                                 start = h2s_multi_idx[0]
                                 end = h2s_multi_idx[-1] + 1
-                                # calculate a bbox that can include all
-                                left = min(lefts[start:end])
-                                top = min(tops[start:end])
-                                right = max(rights[start:end])
-                                bottom = max(bottoms[start:end])
-                                ppageno = ppagenos[start]
-                            else:
-                                raise RuntimeError(
-                                    "Tokens are not aligned!",
-                                    f"hocr tokens: {hocr_tokens}",
-                                    f"spacy tokens: {spacy_tokens}",
-                                )
+                            else:  # no hOCR word maps here; span neighbors' hOCR words
+                                start = s2h_multi[i - 1 if i > 0 else 0]
+                                end = s2h_multi[i + 1] + 1
+                            # calculate a bbox that can include all
+                            left = min(lefts[start:end])
+                            top = min(tops[start:end])
+                            right = max(rights[start:end])
+                            bottom = max(bottoms[start:end])
+                            ppageno = ppagenos[start]
                     # One-to-one mapping is available
                     else:
                         left = lefts[s2h[ptr + i]]

diff --git a/tests/data/hocr_simple/japan.hocr b/tests/data/hocr_simple/japan.hocr
@@ -75,6 +75,16 @@
      </span>
     </p>
    </div>
+   <div class='ocr_carea' id='block_1_2' title="bbox 145 175 245 185">
+    <p class='ocr_par' id='par_1_2' lang='jpn' title="bbox 145 175 245 185">
+     <span class="ocr_line" id="line_1_2" title="bbox 145 175 225 180">
+      <span class="ocrx_word" id="word_1_60" title="bbox 145 175 225 180; x_wconf 92">チェーン店</span>
+     </span>
+     <span class="ocr_line" id="line_1_3" title="bbox 226 181 245 185">
+      <span class="ocrx_word" id="word_1_61" title="bbox 226 181 245 185; x_wconf 92">本・支店</span>
+     </span>
+    </p>
+   </div>
   </div>
  </body>
 </html>
diff --git a/tests/parser/test_parser.py b/tests/parser/test_parser.py
@@ -953,6 +953,14 @@ def test_parse_hocr():
     assert sent.left[1] == 150  # this left comes from "に" in hOCR
     assert sent.right[1] == 249  # this right comes from "ぽん" in hOCR
 
+    sent = doc.sentences[2]
+    assert len(sent.words) == len(sent.left)
+    # "チェーン店\n本・支店" is tokenized into two: "チェーン店", "本・支店" in hOCR,
+    # but it is tokenized as "チェーン", "店本", "・", "支店" by spaCy.
+    assert sent.words[1] == "店本"
+    assert sent.left[1] == 145  # comes from left min of "チェーン店\n本・支店" in hOCR
+    assert sent.right[1] == 245  # comes from right max of "チェーン店\n本・支店" in hOCR
+
 
 def test_parse_hocr_with_tables():
     """Test the parser with hOCR documents that have tables."""