Skip to content

Commit c3e15c7

Browse files
committed
Add to db methods
1 parent 3b1d717 commit c3e15c7

File tree

3 files changed

+20
-1
lines changed

3 files changed

+20
-1
lines changed

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "trainable-entity-extractor"
3-
version = "2025.02.14.07"
3+
version = "2025.03.10"
44
description = "This tool is a trainable text/PDF to entity extractor"
55
license = { file = "LICENSE" }
66
authors = [{ name = "HURIDOCS" }]

src/multilingual_paragraph_extractor/domain/ParagraphFeatures.py

+12
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,15 @@ def get_distance(self, next_paragraph: "ParagraphFeatures") -> float:
256256
return 0
257257

258258
return (next_paragraph.first_token_bounding_box.top - self.last_token_bounding_box.bottom) / self.page_height
259+
260+
def to_db(self):
    """Return a new ``ParagraphFeatures`` built from a fixed subset of fields.

    Exactly eight attributes of this instance are forwarded to the
    constructor as keyword arguments: index, page_height, page_width,
    paragraph_type, page_number, bounding_box, text_cleaned and
    original_text. The values are passed through as-is (no copying).

    NOTE(review): judging by the method name this is presumably a trimmed
    copy intended for persistence -- confirm against the callers.
    """
    # Order matches the keyword order of the constructor call it replaces.
    forwarded_fields = (
        "index",
        "page_height",
        "page_width",
        "paragraph_type",
        "page_number",
        "bounding_box",
        "text_cleaned",
        "original_text",
    )
    constructor_kwargs = {name: getattr(self, name) for name in forwarded_fields}
    return ParagraphFeatures(**constructor_kwargs)

src/multilingual_paragraph_extractor/domain/ParagraphsFromLanguage.py

+7
Original file line numberDiff line numberDiff line change
@@ -432,3 +432,10 @@ def remove_big_no_text_paragraphs(self):
432432
fixed_paragraphs.append(paragraph)
433433

434434
self.paragraphs = fixed_paragraphs
435+
436+
def to_db(self):
    """Return a new ``ParagraphsFromLanguage`` whose paragraphs are converted.

    The language and is_main_language values are forwarded unchanged; each
    element of ``self.paragraphs`` is replaced by the result of calling its
    own ``to_db()``.
    """
    # Attributes are read in the same order as the constructor keywords.
    language = self.language
    converted_paragraphs = [paragraph.to_db() for paragraph in self.paragraphs]
    is_main_language = self.is_main_language
    return ParagraphsFromLanguage(
        language=language,
        paragraphs=converted_paragraphs,
        is_main_language=is_main_language,
    )

0 commit comments

Comments
 (0)