Skip to content

Commit 5f4c2c2

Browse files
committed
Add script to benchmark Marker
1 parent 4a2ed1f commit 5f4c2c2

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

src/multilingual_paragraph_extractor/driver/alignment_benchmark.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def get_alignment_benchmark(model_name: str, show_mistakes: bool = True, file_fi
156156

157157
if __name__ == "__main__":
158158
model_name = "vgt"
159-
show_mistakes = False
159+
show_mistakes = True
160160
# file_filter = ["ohchr_1_en_ru"]
161161
file_filter = []
162162
get_alignment_benchmark(model_name, show_mistakes, file_filter)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import pickle
2+
from pathlib import Path
3+
4+
from multilingual_paragraph_extractor.driver.label_data import get_paths, EXTRACTION_IDENTIFIER, PARAGRAPH_EXTRACTION_PATH
5+
from trainable_entity_extractor.domain.PdfData import PdfData
6+
from trainable_entity_extractor.domain.SegmentationData import SegmentationData
7+
from trainable_entity_extractor.use_cases.XmlFile import XmlFile
8+
9+
# Directory (under PARAGRAPH_EXTRACTION_PATH) where get_segmentation_data
# looks for per-PDF pickled segmentation data.
SEGMENTATION_DATA_PATH = Path(PARAGRAPH_EXTRACTION_PATH, "segmentation_data")
10+
11+
12+
def save_pdfs_data():
    """Pickle the PdfData of every file found in the ``xmls`` directory.

    For each entry of ``PARAGRAPH_EXTRACTION_PATH/xmls`` the PdfData is built
    with ``get_pdf_data`` and dumped to
    ``PARAGRAPH_EXTRACTION_PATH/pdf_data/<name>.pickle``. Entries whose pickle
    already exists are skipped, so the function can be re-run to resume an
    interrupted export.
    """
    pdf_data_dir = Path(PARAGRAPH_EXTRACTION_PATH, "pdf_data")
    # Make sure the output directory exists; otherwise the first open(..., "wb")
    # below fails with FileNotFoundError on a fresh checkout.
    pdf_data_dir.mkdir(parents=True, exist_ok=True)

    for xml_path in Path(PARAGRAPH_EXTRACTION_PATH, "xmls").iterdir():
        pdf_data_pickle = Path(pdf_data_dir, xml_path.name.replace(".xml", ".pickle"))
        if pdf_data_pickle.exists():
            # Already exported on a previous run.
            continue

        pdf_data = get_pdf_data(xml_path.name.replace(".xml", ""))

        with open(pdf_data_pickle, "wb") as f:
            pickle.dump(pdf_data, f)
22+
23+
24+
def get_segmentation_data(pdf_path) -> SegmentationData:
    """Return the SegmentationData for a PDF, using its cached pickle if present.

    The pickle is looked up in ``SEGMENTATION_DATA_PATH`` under the PDF's file
    name with ``.pdf`` replaced by ``.pickle``. When no pickle is found the
    returned object carries an empty list of segment boxes.

    Args:
        pdf_path: Path-like object of the PDF; only its ``name`` is used.

    Returns:
        A SegmentationData with ``page_width`` and ``page_height`` set to 0 and
        ``xml_segments_boxes`` taken from the pickle (or ``[]``).
    """
    # The suffix used to be misspelled ".picke", which never matches a
    # "<name>.pickle" file. The correct spelling is tried first; the old one is
    # kept as a fallback in case any file was stored under the misspelled name.
    candidate_pickles = [
        Path(SEGMENTATION_DATA_PATH, pdf_path.name.replace(".pdf", suffix))
        for suffix in (".pickle", ".picke")
    ]

    xml_segments_boxes = []
    for segmentation_data_pickle in candidate_pickles:
        if not segmentation_data_pickle.exists():
            continue
        with open(segmentation_data_pickle, "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point SEGMENTATION_DATA_PATH at trusted, locally produced files.
            segmentation_data = pickle.load(f)
        xml_segments_boxes = segmentation_data.xml_segments_boxes
        break

    return SegmentationData(page_width=0, page_height=0, xml_segments_boxes=xml_segments_boxes)
33+
34+
35+
def get_pdf_data(pdf_name: str):
    """Build the PdfData for the document called ``pdf_name``.

    The PDF and XML paths come from ``get_paths``. The XML bytes are handed to
    an ``XmlFile`` created for ``EXTRACTION_IDENTIFIER`` (training mode), and
    the result is combined with the document's segmentation data through
    ``PdfData.from_xml_file``.

    Args:
        pdf_name: Document name as expected by ``get_paths``.

    Returns:
        The object returned by ``PdfData.from_xml_file``.
    """
    pdf_path, xml_path = get_paths(pdf_name)

    with open(xml_path, "rb") as xml_stream:
        xml_file = XmlFile(
            extraction_identifier=EXTRACTION_IDENTIFIER,
            to_train=True,
            xml_file_name=xml_path.name,
        )
        xml_file.save(file_content=xml_stream.read())

    return PdfData.from_xml_file(
        xml_file=xml_file,
        segmentation_data=get_segmentation_data(pdf_path),
    )
45+
46+
47+
# Script entry point: export the PdfData pickles for every available XML.
if __name__ == "__main__":
    save_pdfs_data()

0 commit comments

Comments
 (0)