Skip to content

Commit d06730b

Browse files
committed
Add script to get pdf data from marker
1 parent 5f4c2c2 commit d06730b

File tree

1 file changed

+181
-8
lines changed

1 file changed

+181
-8
lines changed

src/multilingual_paragraph_extractor/driver/create_pdf_data_from_marker.py

+181-8
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,154 @@
1+
import json
12
import pickle
3+
from enum import StrEnum
24
from pathlib import Path
35

6+
from bs4 import BeautifulSoup
7+
from pdf_features.PdfFeatures import PdfFeatures
8+
from pdf_features.PdfToken import PdfToken
9+
from pdf_features.Rectangle import Rectangle
10+
from pdf_token_type_labels.TokenType import TokenType
11+
from pydantic import BaseModel, ConfigDict
12+
413
from multilingual_paragraph_extractor.driver.label_data import get_paths, EXTRACTION_IDENTIFIER, PARAGRAPH_EXTRACTION_PATH
514
from trainable_entity_extractor.domain.PdfData import PdfData
15+
from trainable_entity_extractor.domain.SegmentBox import SegmentBox
616
from trainable_entity_extractor.domain.SegmentationData import SegmentationData
717
from trainable_entity_extractor.use_cases.XmlFile import XmlFile
818

919
# Directory under the paragraph-extraction root named "segmentation_data".
SEGMENTATION_DATA_PATH = Path(PARAGRAPH_EXTRACTION_PATH) / "segmentation_data"
# Root folder searched for marker JSON output (one sub-folder per PDF, see get_segmentation_data).
MARKER_JSONS_PATH = Path("marker/jsons/path")
21+
22+
23+
class MarkerTokenType(StrEnum):
24+
Line = "Line"
25+
Span = "Span"
26+
FigureGroup = "FigureGroup"
27+
TableGroup = "TableGroup"
28+
ListGroup = "ListGroup"
29+
PictureGroup = "PictureGroup"
30+
Page = "Page"
31+
Caption = "Caption"
32+
Code = "Code"
33+
Figure = "Figure"
34+
Footnote = "Footnote"
35+
Form = "Form"
36+
Equation = "Equation"
37+
Handwriting = "Handwriting"
38+
TextInlineMath = "TextInlineMath"
39+
ListItem = "ListItem"
40+
PageFooter = "PageFooter"
41+
PageHeader = "PageHeader"
42+
Picture = "Picture"
43+
SectionHeader = "SectionHeader"
44+
Table = "Table"
45+
Text = "Text"
46+
TableOfContents = "TableOfContents"
47+
Document = "Document"
48+
ComplexRegion = "ComplexRegion"
49+
TableCell = "TableCell"
50+
Reference = "Reference"
51+
52+
53+
# Translation table from marker block types to the project's TokenType.
# Every marker type starts out as TokenType.TEXT (keys follow the enum's definition order);
# only the types that map to something other than TEXT are listed explicitly below.
MARKER_TYPE_TO_TOKEN_TYPE: dict[MarkerTokenType, TokenType] = dict.fromkeys(MarkerTokenType, TokenType.TEXT) | {
    # Pictures and figures
    MarkerTokenType.FigureGroup: TokenType.PICTURE,
    MarkerTokenType.PictureGroup: TokenType.PICTURE,
    MarkerTokenType.Figure: TokenType.PICTURE,
    MarkerTokenType.Picture: TokenType.PICTURE,
    # Tables (the table of contents is treated as a table too)
    MarkerTokenType.TableGroup: TokenType.TABLE,
    MarkerTokenType.Table: TokenType.TABLE,
    MarkerTokenType.TableOfContents: TokenType.TABLE,
    MarkerTokenType.TableCell: TokenType.TABLE,
    # Lists
    MarkerTokenType.ListGroup: TokenType.LIST_ITEM,
    MarkerTokenType.ListItem: TokenType.LIST_ITEM,
    # One-to-one counterparts
    MarkerTokenType.Caption: TokenType.CAPTION,
    MarkerTokenType.Footnote: TokenType.FOOTNOTE,
    MarkerTokenType.Equation: TokenType.FORMULA,
    MarkerTokenType.PageFooter: TokenType.PAGE_FOOTER,
    MarkerTokenType.PageHeader: TokenType.PAGE_HEADER,
    MarkerTokenType.SectionHeader: TokenType.SECTION_HEADER,
}
82+
83+
84+
class MarkerLabel(BaseModel):
    """One labelled block read from the ``children`` list of a marker JSON page."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Plain text obtained by stripping the markup from ``html``.
    text: str
    # HTML string exactly as found under the "html" key of the label JSON.
    html: str
    # Block type parsed from the "block_type" key of the label JSON.
    segment_type: MarkerTokenType
    # Rectangle built from the "polygon" key of the label JSON.
    bounding_box: Rectangle

    def __hash__(self):
        """Hash over all four fields (this needs ``Rectangle`` itself to be hashable)."""
        return hash((self.text, self.html, self.segment_type, self.bounding_box))

    @staticmethod
    def polygon_to_rectangle(polygon: list[list[float]]) -> Rectangle:
        """Return the axis-aligned rectangle that encloses ``polygon``.

        Args:
            polygon: Sequence of ``[x, y]`` vertices. The vertices may come in any
                order; the rectangle is taken from the extreme x and y values.

        Returns:
            Rectangle whose coordinates are the extremes truncated to ``int``.

        Raises:
            ValueError: If ``polygon`` is empty.
        """
        xs = [point[0] for point in polygon]
        ys = [point[1] for point in polygon]
        # min/max instead of fixed vertex indices: a different winding or starting corner
        # would otherwise produce an inverted or zero-sized rectangle.
        return Rectangle.from_coordinates(left=int(min(xs)), top=int(min(ys)), right=int(max(xs)), bottom=int(max(ys)))

    @staticmethod
    def extract_html_content(html_string: str) -> str:
        """Return the text content of ``html_string`` with all tags removed."""
        return BeautifulSoup(html_string, "html.parser").get_text()

    @staticmethod
    def from_label_json(label_json: dict) -> "MarkerLabel":
        """Build a label from one block dictionary of the marker JSON.

        Args:
            label_json: Dictionary providing the keys ``"block_type"``, ``"html"`` and ``"polygon"``.

        Raises:
            KeyError: If one of the required keys is missing.
            ValueError: If ``"block_type"`` is not a ``MarkerTokenType`` value.
        """
        html = label_json["html"]
        return MarkerLabel(
            text=MarkerLabel.extract_html_content(html),
            html=html,
            segment_type=MarkerTokenType(label_json["block_type"]),
            bounding_box=MarkerLabel.polygon_to_rectangle(label_json["polygon"]),
        )
116+
117+
118+
class MarkerPage(BaseModel):
    """A single page of a marker JSON document together with its labels."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Page number as returned by extract_page_number.
    page_number: int
    # Third and fourth entries of the page "bbox", truncated to int.
    page_width: int
    page_height: int
    # Labels built from the page's "children" list.
    page_labels: list[MarkerLabel]

    @staticmethod
    def extract_page_number(page_id: str) -> int:
        """Return the third "/"-separated field of ``page_id`` as an integer, plus one.

        Returns 0 when that field is missing or is not an integer.
        """
        id_parts = page_id.split("/")
        try:
            page_index = int(id_parts[2])
        except (IndexError, ValueError):
            return 0
        return page_index + 1

    @staticmethod
    def from_page_json(page_json: dict) -> "MarkerPage":
        """Build a page from one page dictionary of the marker JSON.

        Reads the keys ``"id"``, ``"bbox"`` and ``"children"`` of ``page_json``.
        """
        number = MarkerPage.extract_page_number(page_json["id"])
        page_width = int(page_json["bbox"][2])
        page_height = int(page_json["bbox"][3])
        labels: list[MarkerLabel] = []
        for child_json in page_json["children"]:
            labels.append(MarkerLabel.from_label_json(child_json))
        return MarkerPage(page_number=number, page_width=page_width, page_height=page_height, page_labels=labels)
139+
140+
141+
class MarkerDocument(BaseModel):
    """A marker JSON file parsed into its pages."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Location of the JSON file the document was read from.
    json_path: Path
    # One entry per element of the top-level "children" list of the JSON.
    pages: list[MarkerPage]

    @staticmethod
    def from_json_path(json_path: Path) -> "MarkerDocument":
        """Load the marker JSON stored at ``json_path``.

        Args:
            json_path: Path of the JSON file; its top-level object must have a ``"children"`` list.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the top-level ``"children"`` key is missing.
        """
        # The context manager closes the handle deterministically; the explicit encoding
        # keeps non-ASCII text from depending on the platform's default locale.
        with json_path.open(encoding="utf-8") as json_file:
            json_data: dict = json.load(json_file)
        pages: list[MarkerPage] = [MarkerPage.from_page_json(page_json=page_json) for page_json in json_data["children"]]
        return MarkerDocument(json_path=json_path, pages=pages)
10152

11153

12154
def save_pdfs_data():
@@ -21,15 +163,46 @@ def save_pdfs_data():
21163
pickle.dump(pdf_data, f)
22164

23165

166+
def get_xml_segment_boxes(marker_document: MarkerDocument):
    """Convert every label of every page of ``marker_document`` into a ``SegmentBox``.

    The boxes are returned in page order and, within a page, in label order. Each box
    carries the label geometry, the number and size of its page, and the token type
    looked up in ``MARKER_TYPE_TO_TOKEN_TYPE``.
    """
    return [
        SegmentBox(
            left=marker_label.bounding_box.left,
            top=marker_label.bounding_box.top,
            width=marker_label.bounding_box.width,
            height=marker_label.bounding_box.height,
            page_number=marker_page.page_number,
            page_width=marker_page.page_width,
            page_height=marker_page.page_height,
            segment_type=MARKER_TYPE_TO_TOKEN_TYPE[marker_label.segment_type],
        )
        for marker_page in marker_document.pages
        for marker_label in marker_page.page_labels
    ]
183+
184+
185+
def remove_no_token_marker_labels(marker_labels: list[MarkerLabel], pdf_tokens: list[PdfToken]):
    """Return the labels that were picked by at least one token, in their original order.

    Each token picks the first label (in ``marker_labels`` order) whose bounding box reports
    a truthy intersection percentage with the token's bounding box. Labels picked by no token
    are dropped. The final filter uses ``in`` (equality), so a label equal to a picked one is kept too.
    """
    picked_labels = []
    for token in pdf_tokens:
        first_match = next(
            (
                candidate
                for candidate in marker_labels
                if candidate.bounding_box.get_intersection_percentage(token.bounding_box)
            ),
            None,
        )
        if first_match is not None:
            picked_labels.append(first_match)
    return [candidate for candidate in marker_labels if candidate in picked_labels]
193+
194+
24195
def get_segmentation_data(pdf_path) -> SegmentationData:
    """Build the segmentation data of a PDF from its marker JSON output.

    The marker JSON is read from ``MARKER_JSONS_PATH/<pdf stem>/<pdf stem>.json``. For every
    PDF page that has a marker page with the same page number, the marker labels that no PDF
    token selects are removed (see ``remove_no_token_marker_labels``). The remaining labels
    become the ``xml_segments_boxes`` of the result.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        SegmentationData with ``page_width`` and ``page_height`` set to 0 (each segment box
        carries the size of its own page) and an empty ``label_segments_boxes`` list.
    """
    # .stem removes only the final extension; str.replace(".pdf", "") also removed ".pdf"
    # found in the middle of a name and left an upper-case ".PDF" extension in place.
    pdf_name = Path(pdf_path).stem
    json_path = Path(MARKER_JSONS_PATH, pdf_name, pdf_name + ".json")
    marker_document: MarkerDocument = MarkerDocument.from_json_path(json_path=json_path)
    pdf_features = PdfFeatures.from_pdf_path(pdf_path=pdf_path)

    # Index the marker pages once; setdefault keeps the first page seen for a given number.
    marker_pages_by_number: dict[int, MarkerPage] = {}
    for marker_page in marker_document.pages:
        marker_pages_by_number.setdefault(marker_page.page_number, marker_page)

    for page in pdf_features.pages:
        marker_page = marker_pages_by_number.get(page.page_number)
        if marker_page is None:
            # Marker produced no page for this PDF page, so there are no labels to filter.
            continue
        marker_page.page_labels = remove_no_token_marker_labels(marker_page.page_labels, page.tokens)

    xml_segments_boxes = get_xml_segment_boxes(marker_document)
    return SegmentationData(page_width=0, page_height=0, xml_segments_boxes=xml_segments_boxes, label_segments_boxes=[])
33206

34207

35208
def get_pdf_data(pdf_name: str):

0 commit comments

Comments
 (0)