Skip to content

Commit 7ad538a

Browse files
committed (author name not captured in this extraction)
Fix numpy array concatenation
1 parent c3e15c7 commit 7ad538a

File tree

9 files changed

+18
-17
lines changed

9 files changed

+18
-17
lines changed

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "trainable-entity-extractor"
3-
version = "2025.03.10"
3+
version = "2025.03.11"
44
description = "This tool is a trainable text/PDF to entity extractor"
55
license = { file = "LICENSE" }
66
authors = [{ name = "HURIDOCS" }]

src/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/filter_segments_methods/Beginning750.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from copy import deepcopy
2+
from typing import Optional
23

34
from trainable_entity_extractor.domain.PdfDataSegment import PdfDataSegment
45
from trainable_entity_extractor.use_cases.extractors.pdf_to_multi_option_extractor.FilterSegmentsMethod import (
@@ -22,7 +23,7 @@ def get_first_tokens(self, pdf_data_segments: list[PdfDataSegment], text_length:
2223
return filtered_segments
2324

2425
@staticmethod
25-
def get_segment(pdf_data_segment: PdfDataSegment, character_limit: int):
26+
def get_segment(pdf_data_segment: PdfDataSegment, character_limit: int) -> Optional[PdfDataSegment]:
2627
if character_limit <= 0:
2728
return None
2829

src/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/filter_segments_methods/CleanEndDot500.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def get_tokens(self, pdf_data_segments: list[PdfDataSegment], text_length: int)
1818
if not pdf_data_segment_copy:
1919
break
2020

21-
if pdf_data_segment_copy.text_content and "." == pdf_data_segment.text_cleaned[-1]:
21+
if pdf_data_segment_copy.text_content and "." == pdf_data_segment.text_content[-1]:
2222
pdf_data_segment_copy.text_content += "."
2323

2424
total_text += " " + pdf_data_segment_copy.text_content

src/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/filter_segments_methods/End750.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def get_first_tokens(self, pdf_data_segments: list[PdfDataSegment], text_length:
1414
if not pdf_data_segment_copy:
1515
break
1616

17-
total_text += " " + pdf_data_segment_copy.text_cleaned
17+
total_text += " " + pdf_data_segment_copy.text_content
1818
filtered_segments.append(pdf_data_segment_copy)
1919

2020
return list(reversed(filtered_segments))

src/trainable_entity_extractor/use_cases/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/SentenceSelectorFuzzyCommas.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def get_sentence_segment_list(self, pdf_data_segments) -> list[(str, PdfDataSegm
4242

4343
sentence_segment_list = []
4444
for segment in merged_sentences:
45-
segment_text = " ".join(segment.text_cleaned.split())
45+
segment_text = " ".join(segment.text_content.split())
4646
for text in re.split(r"\.|:", segment_text):
4747
if not text:
4848
continue
@@ -65,17 +65,17 @@ def get_sentence_segment_list(self, pdf_data_segments) -> list[(str, PdfDataSegm
6565
return sentences_across_pages
6666

6767
def get_segments_merged(self, segments):
68-
segments = [segment for segment in segments if segment.text_cleaned.strip()]
68+
segments = [segment for segment in segments if segment.text_content.strip()]
6969
if not segments:
7070
return list()
7171

7272
merged_sentences = [segments[0]]
7373
for segment in segments[1:]:
74-
previous_segment_text = " ".join(merged_sentences[-1].text_cleaned.split())
74+
previous_segment_text = " ".join(merged_sentences[-1].text_content.split())
7575

7676
if previous_segment_text[-1] not in [".", ":"]:
7777
merged_segment = deepcopy(merged_sentences[-1])
78-
merged_segment.text_cleaned = f"{previous_segment_text}, {' '.join(segment.text_cleaned.split())}"
78+
merged_segment.text_content = f"{previous_segment_text}, {' '.join(segment.text_content.split())}"
7979
bounding_boxes = [merged_segment.bounding_box, segment.bounding_box]
8080
merged_segment.bounding_box = Rectangle.merge_rectangles(bounding_boxes)
8181
merged_sentences[-1] = merged_segment
@@ -89,7 +89,7 @@ def get_sample(sample: TrainingSample, sentence_segment_list: list[(str, PdfData
8989
sentence_segments = list()
9090
for sentence, segment in sentence_segment_list:
9191
sentence_segment = deepcopy(segment)
92-
sentence_segment.text_cleaned = sentence
92+
sentence_segment.text_content = sentence
9393
sentence_segments.append(sentence_segment)
9494

9595
sentence_pdf_data = PdfData(pdf_features=None, file_name=sample.pdf_data.file_name)

src/trainable_entity_extractor/use_cases/extractors/pdf_to_text_extractor/methods/GlinerFirstDateMethod.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ def get_date_from_segments(self, segments: list[PdfDataSegment], languages):
1717
merge_segments: list[list[PdfDataSegment]] = self.merge_segments_for_dates(segments)
1818
for segments in merge_segments:
1919
segment_merged = PdfDataSegment.from_list_to_merge(segments)
20-
if not self.contains_year(segment_merged.text_cleaned):
20+
if not self.contains_year(segment_merged.text_content):
2121
continue
2222

23-
date = GlinerDateParserMethod.get_date([segment_merged.text_cleaned])
23+
date = GlinerDateParserMethod.get_date([segment_merged.text_content])
2424
if date:
2525
for segment in segments:
2626
segment.ml_label = 1

src/trainable_entity_extractor/use_cases/extractors/pdf_to_text_extractor/methods/SpaceFixerGlinerFirstDateMethod.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ def contains_year(text: str):
1515

1616
def get_date_from_segments(self, segments: list[PdfDataSegment], languages: list[str]) -> str:
1717
for segment in self.loop_segments(segments):
18-
if not self.contains_year(segment.text_cleaned):
18+
if not self.contains_year(segment.text_content):
1919
continue
20-
date = GlinerDateParserMethod.get_date([segment.text_cleaned])
20+
date = GlinerDateParserMethod.get_date([segment.text_content])
2121
if date:
2222
segment.ml_label = 1
2323
return date.strftime("%Y-%m-%d")

src/trainable_entity_extractor/use_cases/extractors/segment_selector/FastSegmentSelector.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,8 @@ def predict(self, segments):
150150
return []
151151

152152
model = lgb.Booster(model_file=self.model_path)
153-
predictions = model.predict(x)
154-
153+
predictions_array = model.predict(x)
154+
predictions = list(predictions_array) if predictions_array is not None else []
155155
return self.predictions_scores_to_segments(segments, predictions)
156156

157157
def predictions_scores_to_segments(self, segments: list[PdfDataSegment], prediction_scores: list[float]):

src/trainable_entity_extractor/use_cases/extractors/segment_selector/methods/avoiding_words/AvoidingWords.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@ def save_most_frequent_words(self, model_path):
109109

110110
avoiding_words = Counter()
111111
for segment in [segment for segment in self.segments if segment.pdf_segment.ml_label]:
112-
text_tokens = word_tokenize(segment.next_segment.text_cleaned) if segment.next_segment else []
113-
text_tokens += word_tokenize(segment.previous_segment.text_cleaned) if segment.previous_segment else []
112+
text_tokens = word_tokenize(segment.next_segment.text_content) if segment.next_segment else []
113+
text_tokens += word_tokenize(segment.previous_segment.text_content) if segment.previous_segment else []
114114
avoiding_words.update(
115115
[
116116
word.lower()

0 commit comments

Comments (0)