@@ -64,6 +64,46 @@ def test_align_paragraphs_when_no_main_language(self):
64
64
self .assertEqual ("English text" , paragraphs_from_languages [0 ].paragraphs [0 ].text_cleaned )
65
65
self .assertEqual ("French text" , paragraphs_from_languages [1 ].paragraphs [0 ].text_cleaned )
66
66
67
def test_align_paragraphs_when_no_paragraph_in_one_language(self):
    """Align an empty main-language ("en") entry with a one-paragraph "fr" entry.

    The test expects both language entries to stay in the list and both to
    hold zero paragraphs once ``align_languages`` has run.
    """
    english = ParagraphsFromLanguage(language="en", paragraphs=[], is_main_language=True)
    french = ParagraphsFromLanguage(
        language="fr",
        paragraphs=ParagraphFeatures.from_texts(texts=["French text"]),
        is_main_language=False,
    )
    languages = [english, french]

    aligner = MultilingualParagraphAlignerUseCase(extractor_identifier=self.extraction_identifier)
    # The assertions below inspect the same list object that was handed to the aligner.
    aligner.align_languages(languages)

    self.assertEqual(2, len(languages))
    self.assertEqual(0, len(languages[0].paragraphs))
    self.assertEqual(0, len(languages[1].paragraphs))
84
+
85
def test_align_paragraphs_when_no_paragraph_in_other_language(self):
    """Align a one-paragraph main-language ("en") entry with an empty "fr" entry.

    The test expects both entries to end up with exactly one paragraph: the
    English text is kept, and the French entry holds a paragraph whose
    ``text_cleaned`` is the empty string.
    """
    english = ParagraphsFromLanguage(
        language="en",
        paragraphs=ParagraphFeatures.from_texts(texts=["English text"]),
        is_main_language=True,
    )
    french = ParagraphsFromLanguage(language="fr", paragraphs=[], is_main_language=False)
    languages = [english, french]

    aligner = MultilingualParagraphAlignerUseCase(extractor_identifier=self.extraction_identifier)
    # The assertions below inspect the same list object that was handed to the aligner.
    aligner.align_languages(languages)

    # Both language entries survive, each with a single paragraph.
    self.assertEqual(2, len(languages))
    self.assertEqual(1, len(languages[0].paragraphs))
    self.assertEqual(1, len(languages[1].paragraphs))

    # Entry order and language codes are unchanged.
    self.assertEqual("en", languages[0].language)
    self.assertEqual("fr", languages[1].language)

    # The empty language is expected to carry an empty-text paragraph.
    self.assertEqual("English text", languages[0].paragraphs[0].text_cleaned)
    self.assertEqual("", languages[1].paragraphs[0].text_cleaned)
106
+
67
107
@staticmethod
68
108
def get_paragraphs (language : str ):
69
109
paragraphs = ParagraphFeatures .from_texts (texts = [f"a 0. { language } " , f"b 1: { language } " , f"c 2! { language } " ])
0 commit comments