1
+ import json
1
2
import pickle
3
+ from enum import StrEnum
2
4
from pathlib import Path
3
5
6
+ from bs4 import BeautifulSoup
7
+ from pdf_features .PdfFeatures import PdfFeatures
8
+ from pdf_features .PdfToken import PdfToken
9
+ from pdf_features .Rectangle import Rectangle
10
+ from pdf_token_type_labels .TokenType import TokenType
11
+ from pydantic import BaseModel , ConfigDict
12
+
4
13
from multilingual_paragraph_extractor .driver .label_data import get_paths , EXTRACTION_IDENTIFIER , PARAGRAPH_EXTRACTION_PATH
5
14
from trainable_entity_extractor .domain .PdfData import PdfData
15
+ from trainable_entity_extractor .domain .SegmentBox import SegmentBox
6
16
from trainable_entity_extractor .domain .SegmentationData import SegmentationData
7
17
from trainable_entity_extractor .use_cases .XmlFile import XmlFile
8
18
9
19
# Directory named "segmentation_data" under the paragraph-extraction root.
SEGMENTATION_DATA_PATH = Path(PARAGRAPH_EXTRACTION_PATH, "segmentation_data")

# Root of the Marker JSON outputs; get_segmentation_data() reads
# <MARKER_JSONS_PATH>/<pdf name>/<pdf name>.json from it.
# NOTE(review): "marker/jsons/path" looks like a placeholder - confirm the real location.
MARKER_JSONS_PATH = Path("marker/jsons/path")
21
+
22
+
23
+ class MarkerTokenType (StrEnum ):
24
+ Line = "Line"
25
+ Span = "Span"
26
+ FigureGroup = "FigureGroup"
27
+ TableGroup = "TableGroup"
28
+ ListGroup = "ListGroup"
29
+ PictureGroup = "PictureGroup"
30
+ Page = "Page"
31
+ Caption = "Caption"
32
+ Code = "Code"
33
+ Figure = "Figure"
34
+ Footnote = "Footnote"
35
+ Form = "Form"
36
+ Equation = "Equation"
37
+ Handwriting = "Handwriting"
38
+ TextInlineMath = "TextInlineMath"
39
+ ListItem = "ListItem"
40
+ PageFooter = "PageFooter"
41
+ PageHeader = "PageHeader"
42
+ Picture = "Picture"
43
+ SectionHeader = "SectionHeader"
44
+ Table = "Table"
45
+ Text = "Text"
46
+ TableOfContents = "TableOfContents"
47
+ Document = "Document"
48
+ ComplexRegion = "ComplexRegion"
49
+ TableCell = "TableCell"
50
+ Reference = "Reference"
51
+
52
+
53
# Translation table from a Marker block type to the TokenType used for segment boxes.
# Marker types with no dedicated TokenType here (Line, Span, Page, Code, Form,
# Handwriting, ...) are mapped to TokenType.TEXT.
MARKER_TYPE_TO_TOKEN_TYPE = {
    MarkerTokenType.Line: TokenType.TEXT,
    MarkerTokenType.Span: TokenType.TEXT,
    MarkerTokenType.FigureGroup: TokenType.PICTURE,
    MarkerTokenType.TableGroup: TokenType.TABLE,
    MarkerTokenType.ListGroup: TokenType.LIST_ITEM,
    MarkerTokenType.PictureGroup: TokenType.PICTURE,
    MarkerTokenType.Page: TokenType.TEXT,
    MarkerTokenType.Caption: TokenType.CAPTION,
    MarkerTokenType.Code: TokenType.TEXT,
    MarkerTokenType.Figure: TokenType.PICTURE,
    MarkerTokenType.Footnote: TokenType.FOOTNOTE,
    MarkerTokenType.Form: TokenType.TEXT,
    MarkerTokenType.Equation: TokenType.FORMULA,
    MarkerTokenType.Handwriting: TokenType.TEXT,
    MarkerTokenType.TextInlineMath: TokenType.TEXT,
    MarkerTokenType.ListItem: TokenType.LIST_ITEM,
    MarkerTokenType.PageFooter: TokenType.PAGE_FOOTER,
    MarkerTokenType.PageHeader: TokenType.PAGE_HEADER,
    MarkerTokenType.Picture: TokenType.PICTURE,
    MarkerTokenType.SectionHeader: TokenType.SECTION_HEADER,
    MarkerTokenType.Table: TokenType.TABLE,
    MarkerTokenType.Text: TokenType.TEXT,
    MarkerTokenType.TableOfContents: TokenType.TABLE,
    MarkerTokenType.Document: TokenType.TEXT,
    MarkerTokenType.ComplexRegion: TokenType.TEXT,
    MarkerTokenType.TableCell: TokenType.TABLE,
    MarkerTokenType.Reference: TokenType.TEXT,
}
82
+
83
+
84
class MarkerLabel(BaseModel):
    """A single labelled block taken from a Marker JSON page.

    Holds the block's raw HTML, the plain text extracted from that HTML,
    the Marker block type and the block's bounding rectangle.
    """

    # NOTE(review): presumably required because Rectangle is not a pydantic model - confirm.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    text: str
    html: str
    segment_type: MarkerTokenType
    bounding_box: Rectangle

    def __hash__(self):
        """Hash over all four fields."""
        return hash((self.text, self.html, self.segment_type, self.bounding_box))

    @staticmethod
    def polygon_to_rectangle(polygon: list[list[float]]) -> Rectangle:
        """Return the axis-aligned rectangle enclosing ``polygon``.

        Args:
            polygon: ``[x, y]`` points; Marker is assumed to emit four corners.

        Returns:
            A Rectangle whose edges are the integer-truncated min/max
            coordinates of the points.

        For corners ordered top-left, top-right, bottom-right, bottom-left
        this equals reading the corners positionally, but it does not depend
        on the corner order.
        """
        xs = [point[0] for point in polygon]
        ys = [point[1] for point in polygon]
        return Rectangle.from_coordinates(
            left=int(min(xs)),
            top=int(min(ys)),
            right=int(max(xs)),
            bottom=int(max(ys)),
        )

    @staticmethod
    def extract_html_content(html_string: str) -> str:
        """Return the text content of ``html_string`` with all markup removed."""
        return BeautifulSoup(html_string, "html.parser").get_text()

    @classmethod
    def from_label_json(cls, label_json: dict) -> "MarkerLabel":
        """Build a label from one Marker JSON block.

        Args:
            label_json: Block dict providing the ``block_type``, ``html`` and
                ``polygon`` keys.

        Raises:
            KeyError: If one of the required keys is missing.
            ValueError: If ``block_type`` is not a known MarkerTokenType value.
        """
        html = label_json["html"]
        return cls(
            text=cls.extract_html_content(html),
            html=html,
            segment_type=MarkerTokenType(label_json["block_type"]),
            bounding_box=cls.polygon_to_rectangle(label_json["polygon"]),
        )
116
+
117
+
118
class MarkerPage(BaseModel):
    """One page of a Marker JSON document: its number, size and labelled blocks."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_number: int
    page_width: int
    page_height: int
    page_labels: list[MarkerLabel]

    @staticmethod
    def extract_page_number(page_id: str) -> int:
        """Return the 1-based page number encoded in a Marker page id.

        The id is split on "/" and the third component is read as a 0-based
        index (presumably ids look like "/page/0/Page/1" - confirm against
        the Marker output).

        Returns:
            The index plus one, or 0 when the id has no such component or the
            component is not an integer.
        """
        try:
            return int(page_id.split("/")[2]) + 1
        except (IndexError, ValueError):
            return 0

    @classmethod
    def from_page_json(cls, page_json: dict) -> "MarkerPage":
        """Build a page from one entry of the document's ``children`` list.

        Args:
            page_json: Page dict providing the ``id``, ``bbox`` and
                ``children`` keys.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        # bbox[2] and bbox[3] are used as the page size; this assumes the page
        # bbox starts at the origin - TODO confirm.
        bbox = page_json["bbox"]
        # A null "children" value would break iteration; treat it as an empty page.
        children = page_json["children"] or []
        return cls(
            page_number=cls.extract_page_number(page_json["id"]),
            page_width=int(bbox[2]),
            page_height=int(bbox[3]),
            page_labels=[MarkerLabel.from_label_json(label) for label in children],
        )
139
+
140
+
141
class MarkerDocument(BaseModel):
    """A whole Marker JSON output file parsed into pages."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    json_path: Path
    pages: list[MarkerPage]

    @classmethod
    def from_json_path(cls, json_path: Path) -> "MarkerDocument":
        """Load and parse the Marker JSON file at ``json_path``.

        Args:
            json_path: Location of the Marker JSON output.

        Raises:
            FileNotFoundError: If ``json_path`` does not exist.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the top-level ``children`` key is missing.
        """
        # Close the handle deterministically and decode as UTF-8 rather than
        # relying on the platform's default encoding.
        with json_path.open(encoding="utf-8") as json_file:
            json_data: dict = json.load(json_file)
        pages = [MarkerPage.from_page_json(page_json=page_json) for page_json in json_data["children"]]
        return cls(json_path=json_path, pages=pages)
10
152
11
153
12
154
def save_pdfs_data ():
@@ -21,15 +163,46 @@ def save_pdfs_data():
21
163
pickle .dump (pdf_data , f )
22
164
23
165
166
def get_xml_segment_boxes(marker_document: MarkerDocument):
    """Convert every label of every page in ``marker_document`` into a SegmentBox.

    Boxes are returned in document order: page by page, label by label.
    The segment type is looked up in MARKER_TYPE_TO_TOKEN_TYPE.
    """
    segment_boxes: list[SegmentBox] = []
    for marker_page in marker_document.pages:
        segment_boxes.extend(
            SegmentBox(
                left=marker_label.bounding_box.left,
                top=marker_label.bounding_box.top,
                width=marker_label.bounding_box.width,
                height=marker_label.bounding_box.height,
                page_number=marker_page.page_number,
                page_width=marker_page.page_width,
                page_height=marker_page.page_height,
                segment_type=MARKER_TYPE_TO_TOKEN_TYPE[marker_label.segment_type],
            )
            for marker_label in marker_page.page_labels
        )
    return segment_boxes
183
+
184
+
185
def remove_no_token_marker_labels(marker_labels: list[MarkerLabel], pdf_tokens: list[PdfToken]):
    """Return the labels that are matched by at least one PDF token.

    Each token is matched to the first label (in ``marker_labels`` order)
    whose bounding box has a non-zero intersection percentage with the
    token's bounding box. Labels that no token selects are dropped; the
    survivors keep their original relative order.
    """
    # NOTE(review): because only the FIRST intersecting label is recorded per
    # token, a label that overlaps only tokens already claimed by an earlier
    # label is removed as well - confirm this is intended.
    matched_labels = []
    for token in pdf_tokens:
        first_hit = next(
            (
                candidate
                for candidate in marker_labels
                if candidate.bounding_box.get_intersection_percentage(token.bounding_box)
            ),
            None,
        )
        if first_hit is not None:
            matched_labels.append(first_hit)
    return [candidate for candidate in marker_labels if candidate in matched_labels]
193
+
194
+
24
195
def get_segmentation_data(pdf_path) -> SegmentationData:
    """Build the segmentation data of a PDF from its Marker JSON output.

    The Marker document is read from
    ``MARKER_JSONS_PATH/<pdf stem>/<pdf stem>.json``. For every PDF page that
    has a Marker page with the same page number, labels that no PDF token
    selects are removed (see remove_no_token_marker_labels). The remaining
    labels are converted into segment boxes.

    Args:
        pdf_path: Path object pointing at the PDF file.

    Returns:
        SegmentationData with zero page width/height, the segment boxes in
        ``xml_segments_boxes`` and an empty ``label_segments_boxes``.

    Raises:
        FileNotFoundError: If the Marker JSON file does not exist.
    """
    # stem drops only the final suffix; str.replace(".pdf", "") would also
    # strip ".pdf" from the middle of a name and would miss ".PDF".
    pdf_name = pdf_path.stem
    json_path = Path(MARKER_JSONS_PATH, pdf_name, pdf_name + ".json")
    marker_document: MarkerDocument = MarkerDocument.from_json_path(json_path=json_path)
    pdf_features = PdfFeatures.from_pdf_path(pdf_path=pdf_path)

    # Index Marker pages once; setdefault keeps the first page for a given
    # number, matching a first-match linear search.
    marker_pages_by_number: dict = {}
    for marker_page in marker_document.pages:
        marker_pages_by_number.setdefault(marker_page.page_number, marker_page)

    for page in pdf_features.pages:
        marker_page = marker_pages_by_number.get(page.page_number)
        if marker_page is None:
            # Marker produced no page with this number: nothing to filter.
            continue
        marker_page.page_labels = remove_no_token_marker_labels(marker_page.page_labels, page.tokens)

    xml_segments_boxes = get_xml_segment_boxes(marker_document)
    return SegmentationData(page_width=0, page_height=0, xml_segments_boxes=xml_segments_boxes, label_segments_boxes=[])
33
206
34
207
35
208
def get_pdf_data (pdf_name : str ):
0 commit comments