Skip to content

Commit ef5819a

Browse files
committed
complete main function
1 parent 8b2a245 commit ef5819a

File tree

5 files changed

+185
-86
lines changed

5 files changed

+185
-86
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
1-
venv/
1+
venv/
2+
*.pdf
3+
4+
__pycache__/

__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
1-
from .utils import from_buffer
1+
from .pyjpdf import PyjPDFExtract
2+
3+
__author__ = 'Jasson Carvalho'
4+
__version__ = '0.1.0'

pyjpdf.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from .utils import from_buffer,from_path,from_url
2+
import platform
3+
4+
class PyjPDFExtract:
    """
    Facade for extracting the text of PDF documents from several sources.

    Each method forwards to the function of the same name imported from
    ``.utils``; extra positional and keyword arguments are passed through
    unchanged.
    """

    def __init__(self):
        """
        Refuse to instantiate on platforms other than Windows.

        Raises:
            NotImplementedError: If ``platform.system()`` is not ``'Windows'``.
                This is a subclass of ``Exception``, so existing
                ``except Exception`` handlers keep working.
        """
        if platform.system() != 'Windows':
            raise NotImplementedError('PyjPDF is only available for Windows for now')

    def from_buffer(self, stream, *args, **kwargs):
        """
        Extract a PDF held in a buffer.

        Args:
            stream: The buffer containing the PDF data.
            *args: Additional positional arguments forwarded to ``utils.from_buffer``.
            **kwargs: Additional keyword arguments forwarded to ``utils.from_buffer``.

        Returns:
            Whatever ``utils.from_buffer`` returns for this input.
        """
        return from_buffer(stream, *args, **kwargs)

    def from_path(self, path, *args, **kwargs):
        """
        Extract a PDF stored at a file path.

        Args:
            path (str): The path to the PDF file.
            *args: Additional positional arguments forwarded to ``utils.from_path``.
            **kwargs: Additional keyword arguments forwarded to ``utils.from_path``.

        Returns:
            Whatever ``utils.from_path`` returns for this input.
        """
        return from_path(path, *args, **kwargs)

    def from_url(self, url, *args, **kwargs):
        """
        Extract a PDF located at a URL.

        Args:
            url (str): The URL of the PDF document.
            *args: Additional positional arguments forwarded to ``utils.from_url``.
            **kwargs: Additional keyword arguments forwarded to ``utils.from_url``.

        Returns:
            Whatever ``utils.from_url`` returns for this input.
        """
        return from_url(url, *args, **kwargs)

requirements.txt

196 Bytes
Binary file not shown.

utils.py

Lines changed: 122 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,92 +1,130 @@
11
import subprocess
2-
import platform
2+
import platform,os
33
from io import BytesIO
44
import tempfile
5-
import pyautogui, pyperclip,time
6-
7-
def from_buffer(stream):
    """Hand the given buffer back to the caller unchanged.

    Args:
        stream: Any buffer-like object; it is not read or copied here.

    Returns:
        The very same ``stream`` object that was passed in.
    """
    passthrough = stream
    return passthrough
10-
11-
def from_path(path):
    """Load a file's bytes into memory and pass them to ``from_buffer``.

    Args:
        path: Location of the file, opened in binary mode.

    Returns:
        The result of ``from_buffer`` applied to an in-memory ``BytesIO``
        holding the full file contents.
    """
    with open(path, 'rb') as handle:
        payload = handle.read()
    in_memory = BytesIO(payload)
    return from_buffer(in_memory)
15-
16-
def _openChromePDF(stream):
    """Launch an external viewer for the PDF bytes read from ``stream``.

    The command depends on ``platform.system()``:

    * ``'Linux'``: the bytes are written to a temporary ``.pdf`` file that
      is opened with ``xdg-open``, after which ``copy_text()`` is called.
    * ``'Windows'`` / ``'Darwin'``: Chrome is started with ``-`` as its
      argument and the bytes are supplied on standard input.

    The captured stdout and stderr of the child process are printed.

    Args:
        stream: Object with a ``read()`` method returning the PDF bytes.

    Raises:
        NotImplementedError: For any other operating system.
        subprocess.CalledProcessError: If the launched command exits with a
            non-zero status (``check=True``).
    """
    # Read the data from the stream without closing the underlying file.
    pdf_bytes = stream.read()
    system_name = platform.system()

    if system_name == 'Linux':
        # The temporary file must stay open (and therefore exist) while the
        # viewer is launched, so everything happens inside the ``with``.
        with tempfile.NamedTemporaryFile(suffix=".pdf", mode='wb') as temp:
            temp.write(pdf_bytes)
            temp.flush()
            launch = ['xdg-open', temp.name]
            result = subprocess.run(launch, input=pdf_bytes, check=True, capture_output=True)
            copy_text()
            print(result.stdout)
            print(result.stderr)
        return

    if system_name == 'Windows':
        chrome_path = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
        launch = [chrome_path, '-']
    elif system_name == 'Darwin':
        launch = ['open', '-na', 'Google Chrome', '--args', '-']
    else:
        raise NotImplementedError('Your OS is not supported.')

    result = subprocess.run(launch, input=pdf_bytes, check=True, capture_output=True)
    print(result.stdout)
    print(result.stderr)
41-
42-
def copy_text():
    """Click at a fixed screen position, then send Ctrl+A and Ctrl+C.

    Uses ``pyautogui`` to drive the real mouse and keyboard, so whatever
    window is under the pointer receives the input.
    """
    # Adjust the coordinates as needed for where the text sits in the PDF.
    pyautogui.moveTo(500, 500)
    pyautogui.click()

    # Ctrl+A selects all the text, Ctrl+C copies it.
    for shortcut_key in ('a', 'c'):
        pyautogui.hotkey('ctrl', shortcut_key)
52-
53-
def paste_to_file(file_path):
    """Send Ctrl+V and Ctrl+S, then type ``file_path`` and press Enter.

    All input is synthesized with ``pyautogui`` and goes to whichever
    window currently has focus.

    Args:
        file_path: Text typed after Ctrl+S (presumably into a save dialog —
            depends on the focused application).
    """
    # Pause to give the preceding copy time to finish.
    time.sleep(1)

    # Ctrl+V pastes the text, Ctrl+S triggers saving.
    for shortcut_key in ('v', 's'):
        pyautogui.hotkey('ctrl', shortcut_key)

    pyautogui.write(file_path)
    pyautogui.press('enter')
64-
5+
import pyperclip
6+
import time,requests
657
from selenium import webdriver
668
from selenium.webdriver.common.keys import Keys
67-
import time
68-
69-
from selenium import webdriver
70-
import time
71-
72-
# Start the Selenium WebDriver.
driver = webdriver.Chrome(executable_path='/usr/lib/chromium-browser/chromedriver')

try:
    # URL of the PDF, or a link to a PDF file.
    url_pdf = 'file:///home/lordwaif/PyJPDF/teste2.pdf'

    # Navigate to the PDF link.
    driver.get(url_pdf)

    # Wait a few seconds to make sure the PDF has loaded.
    time.sleep(5)
finally:
    # Close the browser even if navigation failed, so no Chrome process
    # is left behind.
    driver.quit()
9+
from selenium.webdriver.chrome.service import Service as ChromeService
10+
from webdriver_manager.chrome import ChromeDriverManager
11+
from selenium.webdriver.common.by import By
12+
from selenium.webdriver.support.ui import WebDriverWait
13+
from selenium.webdriver.support import expected_conditions as EC
14+
import tempfile
8615

16+
def convertPDF(pdf_path_input, pdf_path_output=None, max_tries=3, executable_path=None,
               options_list=None, time_beetwen_tries=1, encoding='utf-8'):
    """
    Extract the text of a PDF by opening it in Chrome and copying it via the clipboard.

    The PDF is loaded in a small, minimized Chrome window driven by Selenium.
    The ``embed`` element is clicked, Ctrl+A / Ctrl+C are sent, and the text
    is read back from the system clipboard with ``pyperclip``. The clipboard
    is first seeded with a sentinel string so a successful copy can be
    detected; the previous clipboard content is therefore overwritten.

    Args:
        pdf_path_input (str): The path to the input PDF file.
        pdf_path_output (str, optional): The path to save the output text file.
            If not provided, the text is returned as a string. Defaults to None.
        max_tries (int, optional): The maximum number of copy attempts. Defaults to 3.
        executable_path (str, optional): The path to the ChromeDriver executable.
            If not provided, one is obtained through ``ChromeDriverManager``.
            Defaults to None.
        options_list (list, optional): Additional command-line arguments for
            Chrome. Defaults to None (no extra arguments).
        time_beetwen_tries (int, optional): Seconds to wait between two copy
            attempts. Defaults to 1.
        encoding (str, optional): The encoding used when writing the output
            text file. Defaults to 'utf-8'.

    Returns:
        str: The extracted text when ``pdf_path_output`` is None; otherwise
        None, after the text has been written to ``pdf_path_output``.

    Raises:
        Exception: If the clipboard still holds the sentinel after
            ``max_tries`` attempts.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from pathlib import Path

    options = webdriver.ChromeOptions()
    for argument in options_list or ():
        options.add_argument(argument)
    options.add_argument("--window-size=50,50")

    # Build the driver through a Service object in both cases; passing
    # ``executable_path`` straight to ``webdriver.Chrome`` is the legacy form.
    if executable_path is None:
        service = ChromeService(ChromeDriverManager().install())
    else:
        service = ChromeService(executable_path=executable_path)
    driver = webdriver.Chrome(service=service, options=options)

    sentinel = 'Não copiei nada'
    try:
        # ``as_uri`` yields a well-formed, percent-encoded file:// URL and
        # requires an absolute path, hence ``resolve()``.
        driver.get(Path(pdf_path_input).resolve().as_uri())
        driver.minimize_window()

        embed_elemento = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'embed'))
        )
        driver.execute_script("arguments[0].click();", embed_elemento)

        texto_copiado = sentinel
        pyperclip.copy(sentinel)
        for attempt in range(max_tries):
            if attempt:
                time.sleep(time_beetwen_tries)

            actions = webdriver.ActionChains
            actions(driver).move_to_element(embed_elemento).click().perform()
            actions(driver).key_down(Keys.CONTROL).send_keys('a').key_up(Keys.CONTROL).perform()
            actions(driver).key_down(Keys.CONTROL).send_keys('c').key_up(Keys.CONTROL).perform()

            texto_copiado = pyperclip.paste()
            if texto_copiado != sentinel:
                break
        else:
            # Every attempt left the sentinel in the clipboard.
            raise Exception('Não foi possível obter o texto do PDF')
    finally:
        # Always close the browser, including on the return-text path and
        # when an exception is raised.
        driver.quit()

    if pdf_path_output is None:
        return texto_copiado
    with open(pdf_path_output, 'w', encoding=encoding) as f:
        f.write(texto_copiado)
81+
82+
def from_buffer(stream, *args, **kwargs):
    """
    Write the buffer to a temporary PDF file and run ``convertPDF`` on it.

    Parameters:
    - stream: The buffer to read from; must expose ``read()`` returning bytes.
    - *args: Additional positional arguments forwarded to ``convertPDF``.
    - **kwargs: Additional keyword arguments forwarded to ``convertPDF``.

    Returns:
    - pdf_content: Whatever ``convertPDF`` returns for the temporary file.
    """
    # ``delete=False`` so the file can be closed (and reopened by another
    # process) before conversion; the ``.pdf`` suffix lets the consumer
    # recognise the file type from its name.
    temp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
    try:
        with temp:
            temp.write(stream.read())
        return convertPDF(temp.name, *args, **kwargs)
    finally:
        # Remove the temporary file even when reading or converting fails.
        os.remove(temp.name)
99+
100+
def from_path(path, *args, **kwargs):
    """
    Run ``convertPDF`` on the PDF located at ``path``.

    Args:
        path (str | os.PathLike): The path to the PDF file. Path-like
            objects are converted to ``str`` before being forwarded.
        *args: Additional positional arguments forwarded to ``convertPDF``.
        **kwargs: Additional keyword arguments forwarded to ``convertPDF``.

    Returns:
        Whatever ``convertPDF`` returns for this file.
    """
    # ``os.fspath`` leaves strings untouched and unwraps ``pathlib.Path`` and
    # other path-like objects, so both kinds of argument are accepted.
    return convertPDF(os.fspath(path), *args, **kwargs)
111+
112+
def from_url(url, *args, timeout=30, **kwargs):
    """
    Download the file at ``url`` and pass its bytes to ``from_buffer``.

    Parameters:
        url (str): The URL of the file to be read.
        *args: Additional positional arguments to be passed to the `from_buffer` function.
        timeout (float, optional): Seconds ``requests`` waits for the server
            before giving up. Keyword-only. Defaults to 30.
        **kwargs: Additional keyword arguments to be passed to the `from_buffer` function.

    Returns:
        Whatever ``from_buffer`` returns for the downloaded content.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an error page to the converter.
    response.raise_for_status()
    return from_buffer(BytesIO(response.content), *args, **kwargs)
87126

88127
if __name__ == '__main__':
    # Manual smoke test: convert a local sample PDF and write the extracted
    # text to 'teste_saida.txt' (passed positionally as the output path).
    path = 'C:/Users/bibil/Documents/PyJPDF/teste2.pdf'
    # The context manager closes the file handle; ``from_buffer`` only needs
    # an object with ``read()``, so the open file is passed directly.
    with open(path, 'rb') as pdf_file:
        from_buffer(pdf_file, 'teste_saida.txt')

0 commit comments

Comments
 (0)