Skip to content

Commit 6587c26

Browse files
committed
Text coordinates in markup.
1 parent 519a5c3 commit 6587c26

File tree

4 files changed

+246
-27
lines changed

4 files changed

+246
-27
lines changed

auxil.py

Lines changed: 199 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
import consts
33

44
import re
5-
from loguru import logger
65
from sys import stdout, platform
76
from random import randint, choice
87
import os
98
import subprocess as sb
109
from argparse import ArgumentTypeError
1110

11+
from PyPDF2 import PdfReader
12+
from loguru import logger
13+
1214
def logger_config(v):
1315
logger.remove()
1416
if int(v) == 0:
@@ -243,3 +245,199 @@ def calculate_seal_coords(sign_coords, new_page=False):
243245
seal_coords = [[x1-pd, y1-pd], [x2+pd, y2+pd]]
244246

245247
return seal_coords
248+
249+
# original_coords: list - list of (x, y) coordinate pairs, optionally
# interleaved with "page_break" marker strings
def calculate_borders(original_coords, creator_and_date=False, task=False):
    """Compute bounding boxes for groups of text coordinates.

    Args:
        original_coords: list of ``[x, y]`` pairs. The string
            ``"page_break"`` may appear between pairs to separate the
            points that belong to different pages.
        creator_and_date: use the padding preset for the creator/date
            sections.
        task: use the padding preset for task sections (takes precedence
            over ``creator_and_date``).

    Returns:
        * ``original_coords`` itself when it is exactly ``["page_break"]``;
        * a single box ``[[x1, y1], [x2, y2]]`` (or ``[]`` for empty input)
          when there are no page-break markers;
        * otherwise a list with one box (or ``[]``) per page segment, each
          followed by ``"page_break"``. Points after the last marker form
          one more trailing box instead of being dropped.
    """

    def calculate(coords):
        # Nothing matched in this segment -> no box.
        if not coords:
            return []

        # Smallest x and the y-range are taken straight from the data, so
        # no sentinel bounds are needed and one point works like many.
        x1 = PDFunits_to_px(min(pair[0] for pair in coords))
        y1 = PDFunits_to_px(min(pair[1] for pair in coords))
        y2 = PDFunits_to_px(max(pair[1] for pair in coords))

        x_offset = consts.text_borders[0]
        y_offset = consts.text_borders[1]

        # Padding applied on the min-y side and on the max-y side.
        if task:
            pad_low, pad_high = y_offset * 2, y_offset / 2
        elif creator_and_date:
            pad_low, pad_high = y_offset * 0.75, y_offset / 4
        else:
            pad_low, pad_high = y_offset * 1.6, y_offset / 2

        # The second corner's x is a fixed value rather than derived from
        # the text itself.
        right_edge = 2385

        return [[x1 - x_offset, y1 - pad_low], [right_edge, y2 + pad_high]]

    if original_coords == ["page_break"]:
        return original_coords

    if "page_break" not in original_coords:
        return calculate(original_coords)

    result = []
    segment = []
    for pair in original_coords:
        if pair == "page_break":
            result.append(calculate(segment))
            result.append("page_break")
            segment = []
        else:
            segment.append(pair)

    # Keep points that follow the last marker (previously discarded).
    if segment:
        result.append(calculate(segment))

    return result
303+
304+
305+
def _extract_page_lines(page):
    """Return ``{line_text: [x, y]}`` for the text chunks of one PDF page.

    Chunks reported by ``page.extract_text`` are merged into lines: a chunk
    whose integer-truncated (x, y) was not seen before on this page starts a
    new line, any other chunk is appended to the most recent line. Lines
    that contain nothing but newline characters are dropped.
    """
    chunks = []

    def visitor_t(text, cm, tm, fontDict, fontSize):
        # tm[4] / tm[5] are used as the chunk's x / y position.
        chunks.append((text, int(tm[4]), int(tm[5])))

    # Chunk-by-chunk extraction of text together with its coordinates.
    page.extract_text(visitor_text=visitor_t)

    lines, positions, seen = [], [], set()
    for chunk, x, y in chunks:
        if (x, y) not in seen:
            seen.add((x, y))
            positions.append([x, y])
            lines.append(chunk)
        else:
            lines[-1] += chunk

    markup = {}
    for line, position in zip(lines, positions):
        # An empty string is a substring of every string, so a blank or
        # newline-only line would "match" a section and distort its box.
        if not line.replace('\n', ''):
            continue
        # Identical lines share one key; the last position wins.
        markup[line] = position

    return markup


# pdf_path: str - path to the pdf file
# data: tuple - tuple of the data used for generation
def calculate_text_coords(pdf_path, data):
    """Locate the generated text sections inside a PDF and box them.

    Args:
        pdf_path: path to the PDF file to read.
        data: sequence indexed as ``(header, name, intro, instruction,
            responsible, creator, date)``. Each ``instruction`` item must
            provide a ``"task_text"`` entry; the other items are matched
            with the ``in`` operator against the extracted lines.

    Returns:
        ``[header, name, intro, tasks, responsible, creator, date]`` where
        every element except ``tasks`` is the result of
        ``calculate_borders`` for that section and ``tasks`` is a list of
        such results, one per entry of the internal task list.
    """
    header = data[0]
    name = data[1]
    intro = data[2]
    instruction = data[3]
    responsible = data[4]
    creator = data[5]
    date = data[6]

    header_coords, name_coords, intro_coords = [], [], []
    instruction_coords = [[] for _ in instruction]
    responsible_coords, creator_coords, date_coords = [], [], []

    task_texts = [item["task_text"] for item in instruction]
    date_text = '.' + date

    reader = PdfReader(pdf_path)
    for page in reader.pages:
        for line, position in _extract_page_lines(page).items():
            needle = line.replace('\n', '')

            # Sections in priority order; a line goes to the first section
            # whose text contains it. The header is only collected until
            # the name has started, and the name until the intro has.
            sections = []
            if not name_coords:
                sections.append((header, header_coords))
            if not intro_coords:
                sections.append((name, name_coords))
            sections.append((intro, intro_coords))
            # zip stops at the real tasks and ignores the page-break
            # entries appended to instruction_coords below.
            sections.extend(zip(task_texts, instruction_coords))
            sections.append((responsible, responsible_coords))
            sections.append((creator, creator_coords))
            sections.append((date_text, date_coords))

            for section_text, bucket in sections:
                if needle in section_text:
                    bucket.append(position)
                    break

        header_coords.append("page_break")
        name_coords.append("page_break")
        intro_coords.append("page_break")
        # NOTE(review): unlike the other sections, the marker is added as a
        # separate ["page_break"] entry after the per-task lists, so a
        # task's box is computed over its matches from all pages. Kept
        # as-is because the output layout depends on it - confirm intent.
        instruction_coords.append(["page_break"])
        responsible_coords.append("page_break")
        creator_coords.append("page_break")
        date_coords.append("page_break")

    task_coords = [
        calculate_borders(task, task=True) for task in instruction_coords
    ]

    return [
        calculate_borders(header_coords),
        calculate_borders(name_coords),
        calculate_borders(intro_coords),
        task_coords,
        calculate_borders(responsible_coords),
        calculate_borders(creator_coords, creator_and_date=True),
        calculate_borders(date_coords, creator_and_date=True),
    ]

consts.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,5 @@
4242

4343
page_w = 612 # PDFUnits
4444
page_h = 792 # PDFUnits
45+
46+
text_borders = (20, 60) # text offset in px

gen.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,14 @@ def generate(data, out, formats, size, samples_dir, is_image):
115115

116116
if 'p' in formats:
117117
pdf_path = write.write_pdf_linux(docx_path, out, count)
118+
119+
generation_data = (header, name, intro, instruction,
120+
responsible, creator, date[0])
121+
118122
if is_image:
119-
write.write_coords(json_path, pdf_path)
123+
write.write_coords(json_path, pdf_path, generation_data, is_image=True)
124+
else:
125+
write.write_coords(json_path, pdf_path, generation_data)
120126

121127
if 'j' in formats:
122128
write.write_jpg(out, count)

write.py

Lines changed: 38 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -197,39 +197,52 @@ def visitor_sign(text, cm, tm, fontDict, fontSize):
197197

198198
return (tmx, tmy, len(page.images))
199199

200-
def write_coords(json_path, pdf_path):
201-
# Координаты логотипа
202-
logo_coords = auxil.calculate_logo_coords()
200+
def write_coords(json_path, pdf_path, data, is_image=False):
203201

204-
# Координаты подписи
205-
(tmx, tmy, im_count) = extract_tm(pdf_path, -1)
202+
with open(json_path, "r") as json_file:
203+
json_dict = json.load(json_file)
206204

207-
if im_count >= 2: # Если на последней странице есть подпись и печать
205+
if is_image:
208206

209-
if (tmx == 0) or (tmy == 0): # Если на странице только подпись и печать
210-
sign_coords = auxil.calculate_sign_coords(tmx, tmy, new_page=True)
211-
212-
else:
213-
sign_coords = auxil.calculate_sign_coords(tmx, tmy)
207+
# Координаты логотипа
208+
logo_coords = auxil.calculate_logo_coords()
214209

215-
else: # Если на последней странице только печать
216-
(tmx, tmy, _) = extract_tm(pdf_path, -2)
217-
sign_coords = auxil.calculate_sign_coords(tmx, tmy)
210+
# Координаты подписи
211+
(tmx, tmy, im_count) = extract_tm(pdf_path, -1)
218212

219-
# Координаты печати
220-
if im_count >= 2:
221-
seal_coords = auxil.calculate_seal_coords(sign_coords)
213+
if im_count >= 2: # Если на последней странице есть подпись и печать
222214

223-
else:
224-
seal_coords = auxil.calculate_seal_coords([], new_page=True)
215+
if (tmx == 0) or (tmy == 0): # Если на странице только подпись и печать
216+
sign_coords = auxil.calculate_sign_coords(tmx, tmy, new_page=True)
217+
218+
else:
219+
sign_coords = auxil.calculate_sign_coords(tmx, tmy)
225220

226-
with open(json_path, "r") as json_file:
227-
json_dict = json.load(json_file)
221+
else: # Если на последней странице только печать
222+
(tmx, tmy, _) = extract_tm(pdf_path, -2)
223+
sign_coords = auxil.calculate_sign_coords(tmx, tmy)
224+
225+
# Координаты печати
226+
if im_count >= 2:
227+
seal_coords = auxil.calculate_seal_coords(sign_coords)
228+
229+
else:
230+
seal_coords = auxil.calculate_seal_coords([], new_page=True)
228231

229-
json_dict["Images"] = {}
230-
json_dict["Images"]["logo_coordinates"] = logo_coords
231-
json_dict["Images"]["signature_coordinates"] = sign_coords
232-
json_dict["Images"]["seal_coordinates"] = seal_coords
232+
json_dict["Images"] = {}
233+
json_dict["Images"]["logo_coordinates"] = logo_coords
234+
json_dict["Images"]["signature_coordinates"] = sign_coords
235+
json_dict["Images"]["seal_coordinates"] = seal_coords
236+
237+
text_coords = auxil.calculate_text_coords(pdf_path, data)
238+
json_dict["Text"] = {}
239+
json_dict["Text"]["header"] = text_coords[0]
240+
json_dict["Text"]["name"] = text_coords[1]
241+
json_dict["Text"]["intro"] = text_coords[2]
242+
json_dict["Text"]["tasks"] = text_coords[3]
243+
json_dict["Text"]["responsible"] = text_coords[4]
244+
json_dict["Text"]["creator"] = text_coords[5]
245+
json_dict["Text"]["date"] = text_coords[6]
233246

234247
with open(json_path, "w") as jsonf:
235248
json.dump(json_dict, jsonf, ensure_ascii=False, indent=4)

0 commit comments

Comments
 (0)