Skip to content

Commit 5f00bbc

Browse files
committed
Fix errors when a PDF has no paragraphs
1 parent 3c0db31 commit 5f00bbc

File tree

4 files changed

+50
-6
lines changed

4 files changed

+50
-6
lines changed

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "trainable-entity-extractor"
3-
version = "2025.4.24.1"
3+
version = "2025.4.24.2"
44
description = "This tool is a trainable text/PDF to entity extractor"
55
license = { file = "LICENSE" }
66
authors = [{ name = "HURIDOCS" }]

src/multilingual_paragraph_extractor/domain/ParagraphsFromLanguage.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,9 @@ def remove_duplicated_text(self):
111111

112112
cleaned_paragraphs.append(paragraph)
113113

114-
cleaned_paragraphs.append(self.paragraphs[-1])
114+
if self.paragraphs:
115+
cleaned_paragraphs.append(self.paragraphs[-1])
116+
115117
return cleaned_paragraphs
116118

117119
def remove_headers_and_footers(self):
@@ -128,7 +130,7 @@ def is_top_or_bottom_of_page(paragraph: ParagraphFeatures, page_height: int):
128130

129131
def find_headers_with_similarities(self):
130132
paragraphs_on_top = [x for x in self.paragraphs if self.is_top_or_bottom_of_page(x, self.paragraphs[0].page_height)]
131-
pages_number = max([x.page_number for x in self.paragraphs])
133+
pages_number = max([x.page_number for x in self.paragraphs]) if self.paragraphs else 1
132134
headers = {}
133135
for paragraph in paragraphs_on_top:
134136
found_match = False
@@ -259,7 +261,7 @@ def set_alignment_scores(self):
259261
unmatched_2 = set(range(len(self.paragraphs)))
260262

261263
indexes_matching: dict[int, int] = dict()
262-
scores: dict[(ParagraphFeatures, ParagraphFeatures), float] = dict()
264+
scores: dict[tuple[ParagraphFeatures, ParagraphFeatures], float] = dict()
263265

264266
for threshold in THRESHOLD:
265267
last_idx2_inserted = 0
@@ -313,6 +315,8 @@ def set_alignment_scores(self):
313315

314316
def is_same_pdf(self):
315317
paragraph_count = len(self._main_language_paragraphs)
318+
if not paragraph_count:
319+
return True
316320
unmatched_paragraphs = [x for x in self._main_language_paragraphs if x not in self._alignment_scores]
317321
match_percentage = 100 * (paragraph_count - len(unmatched_paragraphs)) / paragraph_count
318322
return 50 < match_percentage
@@ -462,7 +466,7 @@ def is_aligned(self, main_language: "ParagraphsFromLanguage") -> bool:
462466
return len(self._aligned_paragraphs) == len(main_language.paragraphs)
463467

464468
def remove_big_no_text_paragraphs(self):
465-
threshold_area = 0.2 * self.paragraphs[0].page_width * self.paragraphs[0].page_height
469+
threshold_area = 0.2 * self.paragraphs[0].page_width * self.paragraphs[0].page_height if self.paragraphs else 0
466470
fixed_paragraphs = list()
467471

468472
for paragraph in self.paragraphs:

src/multilingual_paragraph_extractor/tests/test_align_paragraphs.py

+40
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,46 @@ def test_align_paragraphs_when_no_main_language(self):
6464
self.assertEqual("English text", paragraphs_from_languages[0].paragraphs[0].text_cleaned)
6565
self.assertEqual("French text", paragraphs_from_languages[1].paragraphs[0].text_cleaned)
6666

67+
def test_align_paragraphs_when_no_paragraph_in_one_language(self):
68+
language_paragraph_1 = ParagraphsFromLanguage(language="en", paragraphs=[], is_main_language=True)
69+
70+
pdf_data_paragraphs_2 = ParagraphFeatures.from_texts(texts=["French text"])
71+
language_paragraph_2 = ParagraphsFromLanguage(
72+
language="fr", paragraphs=pdf_data_paragraphs_2, is_main_language=False
73+
)
74+
75+
multilingual_paragraph_extractor = MultilingualParagraphAlignerUseCase(
76+
extractor_identifier=self.extraction_identifier
77+
)
78+
paragraphs_from_languages = [language_paragraph_1, language_paragraph_2]
79+
multilingual_paragraph_extractor.align_languages(paragraphs_from_languages)
80+
81+
self.assertEqual(2, len(paragraphs_from_languages))
82+
self.assertEqual(0, len(paragraphs_from_languages[0].paragraphs))
83+
self.assertEqual(0, len(paragraphs_from_languages[1].paragraphs))
84+
85+
def test_align_paragraphs_when_no_paragraph_in_other_language(self):
86+
pdf_data_paragraphs_1 = ParagraphFeatures.from_texts(texts=["English text"])
87+
language_paragraph_1 = ParagraphsFromLanguage(language="en", paragraphs=pdf_data_paragraphs_1, is_main_language=True)
88+
89+
language_paragraph_2 = ParagraphsFromLanguage(language="fr", paragraphs=[], is_main_language=False)
90+
91+
multilingual_paragraph_extractor = MultilingualParagraphAlignerUseCase(
92+
extractor_identifier=self.extraction_identifier
93+
)
94+
paragraphs_from_languages = [language_paragraph_1, language_paragraph_2]
95+
multilingual_paragraph_extractor.align_languages(paragraphs_from_languages)
96+
97+
self.assertEqual(2, len(paragraphs_from_languages))
98+
self.assertEqual(1, len(paragraphs_from_languages[0].paragraphs))
99+
self.assertEqual(1, len(paragraphs_from_languages[1].paragraphs))
100+
101+
self.assertEqual("en", paragraphs_from_languages[0].language)
102+
self.assertEqual("fr", paragraphs_from_languages[1].language)
103+
104+
self.assertEqual("English text", paragraphs_from_languages[0].paragraphs[0].text_cleaned)
105+
self.assertEqual("", paragraphs_from_languages[1].paragraphs[0].text_cleaned)
106+
67107
@staticmethod
68108
def get_paragraphs(language: str):
69109
paragraphs = ParagraphFeatures.from_texts(texts=[f"a 0. {language}", f"b 1: {language}", f"c 2! {language}"])

src/multilingual_paragraph_extractor/use_cases/MultilingualParagraphAlignerUseCase.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def __init__(self, extractor_identifier: ExtractionIdentifier):
88

99
def align_languages(self, paragraphs_from_languages: list[ParagraphsFromLanguage]):
1010
if not paragraphs_from_languages:
11-
return []
11+
return
1212

1313
self.clean_paragraphs(paragraphs_from_languages)
1414
main_language, other_languages = self.get_main_and_other_languages(paragraphs_from_languages)

0 commit comments

Comments
 (0)