Skip to content

Commit b1f4044

Browse files
committed
Use pydantic for training sample and prediction sample
1 parent f80c24e commit b1f4044

File tree

13 files changed: 97 additions & 88 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "trainable-entity-extractor"
3-
version = "2025.03.20.04"
3+
version = "2025.03.20.05"
44
description = "This tool is a trainable text/PDF to entity extractor"
55
license = { file = "LICENSE" }
66
authors = [{ name = "HURIDOCS" }]

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/multi_labels_methods/test_can_be_used.py

Lines changed: 16 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -49,14 +49,22 @@ def setUp(self):
4949
pdf_data_8 = PdfData.from_texts(["point 8"])
5050

5151
samples = [
52-
TrainingSample(pdf_data_1, LabeledData(values=[self.options[0]], language_iso="en")),
53-
TrainingSample(pdf_data_2, LabeledData(values=[self.options[1], self.options[2]], language_iso="es")),
54-
TrainingSample(pdf_data_3, LabeledData(values=[self.options[2], self.options[3]], language_iso="en")),
55-
TrainingSample(pdf_data_4, LabeledData(values=[self.options[3]], language_iso="fr")),
56-
TrainingSample(pdf_data_5, LabeledData(values=[self.options[4], self.options[0]], language_iso="en")),
57-
TrainingSample(pdf_data_6, LabeledData(values=[self.options[5]], language_iso="en")),
58-
TrainingSample(pdf_data_7, LabeledData(values=[self.options[6], self.options[5]], language_iso="ru")),
59-
TrainingSample(pdf_data_8, LabeledData(values=[self.options[7]], language_iso="ru")),
52+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[self.options[0]], language_iso="en")),
53+
TrainingSample(
54+
pdf_data=pdf_data_2, labeled_data=LabeledData(values=[self.options[1], self.options[2]], language_iso="es")
55+
),
56+
TrainingSample(
57+
pdf_data=pdf_data_3, labeled_data=LabeledData(values=[self.options[2], self.options[3]], language_iso="en")
58+
),
59+
TrainingSample(pdf_data=pdf_data_4, labeled_data=LabeledData(values=[self.options[3]], language_iso="fr")),
60+
TrainingSample(
61+
pdf_data=pdf_data_5, labeled_data=LabeledData(values=[self.options[4], self.options[0]], language_iso="en")
62+
),
63+
TrainingSample(pdf_data=pdf_data_6, labeled_data=LabeledData(values=[self.options[5]], language_iso="en")),
64+
TrainingSample(
65+
pdf_data=pdf_data_7, labeled_data=LabeledData(values=[self.options[6], self.options[5]], language_iso="ru")
66+
),
67+
TrainingSample(pdf_data=pdf_data_8, labeled_data=LabeledData(values=[self.options[7]], language_iso="ru")),
6068
]
6169

6270
self.extraction_data_english_multi = ExtractionData(

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/multi_labels_methods/test_setfit_english.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -44,11 +44,11 @@ def test_train_and_predict(self):
4444
pdf_data_5 = PdfData.from_texts(["point 5"])
4545

4646
samples = [
47-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
48-
TrainingSample(pdf_data_2, LabeledData(values=[options[1], options[2]])),
49-
TrainingSample(pdf_data_3, LabeledData(values=[options[2]])),
50-
TrainingSample(pdf_data_4, LabeledData(values=[options[3], options[0]])),
51-
TrainingSample(pdf_data_5, LabeledData(values=[options[4]])),
47+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
48+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1], options[2]])),
49+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2]])),
50+
TrainingSample(pdf_data=pdf_data_4, labeled_data=LabeledData(values=[options[3], options[0]])),
51+
TrainingSample(pdf_data=pdf_data_5, labeled_data=LabeledData(values=[options[4]])),
5252
]
5353

5454
samples = samples * 2

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/multi_labels_methods/test_setfit_multilingual.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -44,11 +44,11 @@ def test_train_and_predict(self):
4444
pdf_data_5 = PdfData.from_texts(["point 5"])
4545

4646
samples = [
47-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
48-
TrainingSample(pdf_data_2, LabeledData(values=[options[1], options[2]])),
49-
TrainingSample(pdf_data_3, LabeledData(values=[options[2]])),
50-
TrainingSample(pdf_data_4, LabeledData(values=[options[3], options[0]])),
51-
TrainingSample(pdf_data_5, LabeledData(values=[options[4]])),
47+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
48+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1], options[2]])),
49+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2]])),
50+
TrainingSample(pdf_data=pdf_data_4, labeled_data=LabeledData(values=[options[3], options[0]])),
51+
TrainingSample(pdf_data=pdf_data_5, labeled_data=LabeledData(values=[options[4]])),
5252
]
5353

5454
extraction_data = ExtractionData(

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/multi_labels_methods/test_single_label_setfit_english.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,9 @@ def test_train_and_predict(self):
3636
pdf_data_3 = PdfData.from_texts(["point 3"])
3737

3838
samples = [
39-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
40-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
41-
TrainingSample(pdf_data_3, LabeledData(values=[options[2]])),
39+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
40+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
41+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2]])),
4242
]
4343

4444
extraction_data = ExtractionData(

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/multi_labels_methods/test_single_label_setfit_multilingual.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,9 @@ def test_train_and_predict(self):
3636
pdf_data_3 = PdfData.from_texts(["point 3"])
3737

3838
samples = [
39-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
40-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
41-
TrainingSample(pdf_data_3, LabeledData(values=[options[2]])),
39+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
40+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
41+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2]])),
4242
]
4343

4444
extraction_data = ExtractionData(

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/test_fast_segment_selector_fuzzy_commas.py

Lines changed: 21 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -28,10 +28,10 @@ def test_performance_100(self):
2828
pdf_data_4 = PdfData.from_texts(["2, 3"])
2929

3030
samples = [
31-
TrainingSample(pdf_data_1, LabeledData(values=[options[0], options[1]])),
32-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
33-
TrainingSample(pdf_data_3, LabeledData(values=[options[2], options[0]])),
34-
TrainingSample(pdf_data_4, LabeledData(values=[options[1], options[2]])),
31+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0], options[1]])),
32+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
33+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2], options[0]])),
34+
TrainingSample(pdf_data=pdf_data_4, labeled_data=LabeledData(values=[options[1], options[2]])),
3535
]
3636

3737
multi_option_data = ExtractionData(
@@ -52,10 +52,10 @@ def test_performance_83(self):
5252
pdf_data_4 = PdfData.from_texts(["4"])
5353

5454
samples = [
55-
TrainingSample(pdf_data_1, LabeledData(values=[options[0], options[1]])),
56-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
57-
TrainingSample(pdf_data_3, LabeledData(values=[options[2], options[0]])),
58-
TrainingSample(pdf_data_4, LabeledData(values=[options[1], options[2]])),
55+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0], options[1]])),
56+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
57+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2], options[0]])),
58+
TrainingSample(pdf_data=pdf_data_4, labeled_data=LabeledData(values=[options[1], options[2]])),
5959
]
6060

6161
multi_option_data = ExtractionData(
@@ -82,10 +82,10 @@ def test_predictions(self):
8282
pdf_data_5 = PdfData.from_texts(["4, 3"])
8383

8484
samples = [
85-
TrainingSample(pdf_data_1, LabeledData(values=[options[0], options[1]])),
86-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
87-
TrainingSample(pdf_data_3, LabeledData(values=[options[2], options[0]])),
88-
TrainingSample(pdf_data_4, LabeledData(values=[options[1], options[2]])),
85+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0], options[1]])),
86+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
87+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2], options[0]])),
88+
TrainingSample(pdf_data=pdf_data_4, labeled_data=LabeledData(values=[options[1], options[2]])),
8989
]
9090

9191
multi_option_data = ExtractionData(
@@ -96,8 +96,8 @@ def test_predictions(self):
9696
fast_segment_selector_fuzzy_commas.train(multi_option_data)
9797

9898
prediction_samples = [
99-
TrainingSample(pdf_data_1, LabeledData(values=[])),
100-
TrainingSample(pdf_data_5, LabeledData(values=[])),
99+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[])),
100+
TrainingSample(pdf_data=pdf_data_5, labeled_data=LabeledData(values=[])),
101101
]
102102
prediction_multi_option_data = ExtractionData(
103103
multi_value=True, options=options, samples=prediction_samples, extraction_identifier=self.extraction_identifier
@@ -122,9 +122,9 @@ def test_predictions_when_empy_data(self):
122122
pdf_data_3 = PdfData.from_texts(["3, 1"])
123123

124124
samples = [
125-
TrainingSample(pdf_data_1, LabeledData(values=[options[0], options[1]])),
126-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
127-
TrainingSample(pdf_data_3, LabeledData(values=[options[2], options[0]])),
125+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0], options[1]])),
126+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
127+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2], options[0]])),
128128
]
129129

130130
multi_option_data = ExtractionData(
@@ -135,10 +135,10 @@ def test_predictions_when_empy_data(self):
135135
fast_segment_selector_fuzzy_commas.train(multi_option_data)
136136

137137
prediction_samples = [
138-
TrainingSample(pdf_data_1, LabeledData(values=[])),
139-
TrainingSample(PdfData(), LabeledData(values=[])),
140-
TrainingSample(PdfData.from_texts([]), LabeledData(values=[])),
141-
TrainingSample(PdfData.from_texts([""]), LabeledData(values=[])),
138+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[])),
139+
TrainingSample(pdf_data=PdfData(), labeled_data=LabeledData(values=[])),
140+
TrainingSample(pdf_data=PdfData.from_texts([]), labeled_data=LabeledData(values=[])),
141+
TrainingSample(pdf_data=PdfData.from_texts([""]), labeled_data=LabeledData(values=[])),
142142
]
143143
prediction_multi_option_data = ExtractionData(
144144
multi_value=True, options=options, samples=prediction_samples, extraction_identifier=self.extraction_identifier

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/test_filter_segments.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -35,9 +35,9 @@ def get_data(self) -> ExtractionData:
3535
pdf_data_3 = PdfData.from_texts(["point 3", "point 3", "point 3"])
3636

3737
samples = [
38-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
39-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
40-
TrainingSample(pdf_data_3, LabeledData(values=[options[2]])),
38+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
39+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
40+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2]])),
4141
]
4242

4343
multi_option_data = ExtractionData(
@@ -53,7 +53,7 @@ def get_data_for_context(self) -> ExtractionData:
5353
pdf_data = PdfData.from_texts(["point 1"] * 3000)
5454

5555
samples = [
56-
TrainingSample(pdf_data, LabeledData(values=[options[0]])),
56+
TrainingSample(pdf_data=pdf_data, labeled_data=LabeledData(values=[options[0]])),
5757
]
5858

5959
multi_option_data = ExtractionData(
@@ -69,7 +69,7 @@ def get_empty_segments(self):
6969
pdf_data_1 = PdfData.from_texts([""])
7070

7171
samples = [
72-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
72+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
7373
]
7474

7575
multi_option_data = ExtractionData(
@@ -84,7 +84,7 @@ def get_no_segments(self):
8484
pdf_data_1 = PdfData()
8585

8686
samples = [
87-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
87+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
8888
]
8989

9090
multi_option_data = ExtractionData(

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/test_fuzzy_methods.py

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -35,9 +35,9 @@ def test_fuzzy_all_100(self):
3535
pdf_data_3 = PdfData.from_texts(["blah. item 10, item 1. blah"])
3636

3737
samples = [
38-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
39-
TrainingSample(pdf_data_2, LabeledData(values=[options[2]])),
40-
TrainingSample(pdf_data_3, LabeledData(values=[options[0], options[2]])),
38+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
39+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[2]])),
40+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[0], options[2]])),
4141
]
4242

4343
multi_option_data = ExtractionData(
@@ -61,8 +61,8 @@ def test_fuzzy_commas(self):
6161
pdf_data_2 = PdfData.from_texts(["blah, 10 item, item 1, blah"])
6262

6363
samples = [
64-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
65-
TrainingSample(pdf_data_2, LabeledData(values=[options[2]])),
64+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
65+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[2]])),
6666
]
6767

6868
multi_option_data = ExtractionData(
@@ -93,7 +93,7 @@ def test_fuzzy_commas_aliases(self):
9393
pdf_data_1.pdf_data_segments[0].ml_label = 1
9494

9595
samples = [
96-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
96+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
9797
]
9898

9999
multi_option_data = ExtractionData(
@@ -119,9 +119,9 @@ def test_fast_segment_selector_fuzzy_95(self):
119119
pdf_data_3 = PdfData.from_texts(["foo", "var", "mark 1", "item 10", text])
120120

121121
samples = [
122-
TrainingSample(pdf_data_1, LabeledData(values=[options[0], options[1], options[2]])),
123-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
124-
TrainingSample(pdf_data_3, LabeledData(values=[options[2]])),
122+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0], options[1], options[2]])),
123+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
124+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2]])),
125125
] * 5
126126

127127
multi_option_data = ExtractionData(
@@ -160,7 +160,7 @@ def test_fuzzy_all_75(self):
160160
pdf_data_1 = PdfData.from_texts(["blah. item 1. blah"])
161161

162162
samples = [
163-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
163+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
164164
]
165165

166166
multi_option_data = ExtractionData(
@@ -183,9 +183,9 @@ def test_fuzzy_first(self):
183183
pdf_data_3 = PdfData.from_texts(["blah. item 10, item 1. blah"])
184184

185185
samples = [
186-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
187-
TrainingSample(pdf_data_2, LabeledData(values=[options[2]])),
188-
TrainingSample(pdf_data_3, LabeledData(values=[options[0], options[2]])),
186+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
187+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[2]])),
188+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[0], options[2]])),
189189
]
190190

191191
multi_option_data = ExtractionData(

src/tests/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/test_pdf_to_multi_option_extraction.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -24,9 +24,9 @@ def test_single_value(self):
2424
pdf_data_3 = PdfData.from_texts(["point 3"])
2525

2626
samples = [
27-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
28-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
29-
TrainingSample(pdf_data_3, LabeledData(values=[options[2]])),
27+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
28+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
29+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2]])),
3030
]
3131

3232
multi_option_data = ExtractionData(
@@ -53,9 +53,9 @@ def test_multi_value(self):
5353
pdf_data_3 = PdfData.from_texts(["point 3 point 1"])
5454

5555
samples = [
56-
TrainingSample(pdf_data_1, LabeledData(values=[options[0], options[1]])),
57-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
58-
TrainingSample(pdf_data_3, LabeledData(values=[options[2], options[0]])),
56+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0], options[1]])),
57+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
58+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2], options[0]])),
5959
]
6060

6161
multi_option_data = ExtractionData(
@@ -86,9 +86,9 @@ def test_no_prediction_data(self):
8686
pdf_data_3 = PdfData.from_texts(["point three point one"])
8787

8888
samples = [
89-
TrainingSample(pdf_data_1, LabeledData(values=[options[0]])),
90-
TrainingSample(pdf_data_2, LabeledData(values=[options[1]])),
91-
TrainingSample(pdf_data_3, LabeledData(values=[options[2]])),
89+
TrainingSample(pdf_data=pdf_data_1, labeled_data=LabeledData(values=[options[0]])),
90+
TrainingSample(pdf_data=pdf_data_2, labeled_data=LabeledData(values=[options[1]])),
91+
TrainingSample(pdf_data=pdf_data_3, labeled_data=LabeledData(values=[options[2]])),
9292
]
9393

9494
multi_option_data = ExtractionData(

0 commit comments

Comments
 (0)