1
1
import subprocess
2
- import platform
2
+ import platform , os
3
3
from io import BytesIO
4
4
import tempfile
5
- import pyautogui , pyperclip ,time
6
-
7
- def from_buffer (stream ):
8
- """Read the buffer and return the data."""
9
- return stream
10
-
11
- def from_path (path ):
12
- """Read the file and return the data."""
13
- with open (path , 'rb' ) as stream :
14
- return from_buffer (BytesIO (stream .read ()))
15
-
16
- def _openChromePDF (stream ):
17
- """Open the PDF in Chrome."""
18
- # Leitura dos dados do stream sem fechar o arquivo
19
- data = stream .read ()
20
- if platform .system () == 'Windows' :
21
- chrome_path = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
22
- command = [chrome_path , '-' ]
23
- elif platform .system () == 'Darwin' :
24
- command = ['open' , '-na' , 'Google Chrome' , '--args' , '-' ]
25
- elif platform .system () == 'Linux' :
26
- with tempfile .NamedTemporaryFile (suffix = ".pdf" ,mode = 'wb' ) as temp :
27
- temp .write (data )
28
- temp .flush ()
29
- command = ['xdg-open' , temp .name ]
30
- process = subprocess .run (command , input = data , check = True , capture_output = True )
31
- copy_text ()
32
- print (process .stdout )
33
- print (process .stderr )
34
- else :
35
- raise NotImplementedError ('Your OS is not supported.' )
36
-
37
- if platform .system () != 'Linux' :
38
- process = subprocess .run (command , input = data , check = True , capture_output = True )
39
- print (process .stdout )
40
- print (process .stderr )
41
-
42
- def copy_text ():
43
- # Ajuste as coordenadas conforme necessário para a posição do texto no PDF
44
- pyautogui .moveTo (500 , 500 ) # Ajuste as coordenadas conforme necessário
45
- pyautogui .click ()
46
-
47
- # Seleciona todo o texto (Ctrl + A)
48
- pyautogui .hotkey ('ctrl' , 'a' )
49
-
50
- # Copia o texto (Ctrl + C)
51
- pyautogui .hotkey ('ctrl' , 'c' )
52
-
53
- def paste_to_file (file_path ):
54
- # Pausa para dar tempo de copiar o texto
55
- time .sleep (1 )
56
-
57
- # Cola o texto em um arquivo
58
- pyautogui .hotkey ('ctrl' , 'v' )
59
-
60
- # Salva o arquivo (Ctrl + S)
61
- pyautogui .hotkey ('ctrl' , 's' )
62
- pyautogui .write (file_path )
63
- pyautogui .press ('enter' )
64
-
5
+ import pyperclip
6
+ import time ,requests
65
7
from selenium import webdriver
66
8
from selenium .webdriver .common .keys import Keys
67
- import time
68
-
69
- from selenium import webdriver
70
- import time
71
-
72
- # Iniciar o WebDriver do Selenium
73
- driver = webdriver .Chrome (executable_path = '/usr/lib/chromium-browser/chromedriver' )
74
-
75
- # URL do PDF ou link para um arquivo PDF
76
- url_pdf = 'file:///home/lordwaif/PyJPDF/teste2.pdf'
77
-
78
- # Navegar até o link do PDF
79
- driver .get (url_pdf )
80
-
81
- # Esperar alguns segundos para garantir que o PDF seja carregado
82
- time .sleep (5 )
83
-
84
- # Fechar o navegador
85
- driver .quit ()
9
+ from selenium .webdriver .chrome .service import Service as ChromeService
10
+ from webdriver_manager .chrome import ChromeDriverManager
11
+ from selenium .webdriver .common .by import By
12
+ from selenium .webdriver .support .ui import WebDriverWait
13
+ from selenium .webdriver .support import expected_conditions as EC
14
+ import tempfile
86
15
16
def convertPDF(pdf_path_input, pdf_path_output=None, max_tries=3, executable_path=None, options_list=None, time_beetwen_tries=1, encoding='utf-8'):
    """
    Extract the text of a PDF by opening it in Chrome and copying it via the clipboard.

    The PDF is loaded through Selenium, the viewer's ``<embed>`` element is
    focused, and Ctrl+A / Ctrl+C are sent so the text lands on the system
    clipboard, from where it is read back with ``pyperclip``.

    NOTE: this overwrites whatever is currently on the system clipboard.
    The browser is not started headless by default; pass extra Chrome flags
    through ``options_list`` if needed.

    Args:
        pdf_path_input (str): Path to the input PDF file (absolute or relative).
        pdf_path_output (str, optional): Path of the text file to write the
            extracted text to. If None, the text is returned instead.
        max_tries (int, optional): Maximum number of select/copy attempts.
            Must be at least 1. Defaults to 3.
        executable_path (str, optional): Path to the ChromeDriver executable.
            If None, one is obtained through ``ChromeDriverManager``.
        options_list (list, optional): Extra command-line arguments for
            Chrome. Defaults to None (no extra arguments).
        time_beetwen_tries (int, optional): Seconds to wait after a failed
            copy attempt. Defaults to 1.
        encoding (str, optional): Encoding used when writing
            ``pdf_path_output``. Defaults to 'utf-8'.

    Returns:
        str | None: The extracted text when ``pdf_path_output`` is None;
        otherwise None (the text is written to ``pdf_path_output``).

    Raises:
        ValueError: If ``max_tries`` is smaller than 1.
        RuntimeError: If no text could be copied after ``max_tries`` attempts.
    """
    # Local import so this block does not depend on a new file-level import.
    from pathlib import Path

    if max_tries < 1:
        raise ValueError('max_tries must be at least 1')

    options = webdriver.ChromeOptions()
    for argument in options_list or ():
        options.add_argument(argument)
    options.add_argument("--window-size=50,50")

    # Selenium 4 takes the driver location through a Service object.
    if executable_path is None:
        service = ChromeService(ChromeDriverManager().install())
    else:
        service = ChromeService(executable_path=executable_path)
    driver = webdriver.Chrome(service=service, options=options)

    # Placeholder put on the clipboard first: if it is still there after
    # Ctrl+C, the copy did not happen.
    sentinel = 'Não copiei nada'
    texto_copiado = sentinel
    try:
        # as_uri() needs an absolute path and takes care of escaping.
        driver.get(Path(pdf_path_input).resolve().as_uri())
        driver.minimize_window()

        embed_elemento = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.TAG_NAME, 'embed'))
        )
        driver.execute_script("arguments[0].click();", embed_elemento)

        pyperclip.copy(sentinel)
        for _ in range(max_tries):
            # Focus the viewer, then select all and copy.
            webdriver.ActionChains(driver).move_to_element(embed_elemento).click().perform()
            webdriver.ActionChains(driver).key_down(Keys.CONTROL).send_keys('a').key_up(Keys.CONTROL).perform()
            webdriver.ActionChains(driver).key_down(Keys.CONTROL).send_keys('c').key_up(Keys.CONTROL).perform()

            texto_copiado = pyperclip.paste()
            if texto_copiado != sentinel:
                break
            time.sleep(time_beetwen_tries)
        else:
            raise RuntimeError('Não foi possível obter o texto do PDF')
    finally:
        # Always close the browser, including on early return or error.
        driver.quit()

    if pdf_path_output is None:
        return texto_copiado
    with open(pdf_path_output, 'w', encoding=encoding) as f:
        f.write(texto_copiado)
81
+
82
def from_buffer(stream, *args, **kwargs):
    """
    Extract the text of a PDF held in a binary file-like object.

    The stream's content is written to a temporary ``.pdf`` file, which is
    then handed to ``convertPDF``. The temporary file is removed afterwards,
    even if the conversion fails.

    Args:
        stream: Binary file-like object; only ``stream.read()`` is called.
        *args: Extra positional arguments forwarded to ``convertPDF``
            (after the input path).
        **kwargs: Extra keyword arguments forwarded to ``convertPDF``.

    Returns:
        Whatever ``convertPDF`` returns for the temporary file.
    """
    # delete=False: the file is closed before Chrome opens it (an open
    # NamedTemporaryFile cannot be reopened by name on Windows), so it has
    # to be removed manually below.
    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp:
        temp.write(stream.read())
        temp_path = temp.name
    try:
        return convertPDF(temp_path, *args, **kwargs)
    finally:
        os.remove(temp_path)
99
+
100
def from_path(path, *args, **kwargs):
    """
    Extract the text of the PDF located at ``path``.

    Thin wrapper that delegates to ``convertPDF``.

    Args:
        path (str): The path to the PDF file.
        *args: Extra positional arguments forwarded to ``convertPDF``.
        **kwargs: Extra keyword arguments forwarded to ``convertPDF``.

    Returns:
        Whatever ``convertPDF`` returns for ``path``.
    """
    extracted = convertPDF(path, *args, **kwargs)
    return extracted
111
+
112
def from_url(url, *args, timeout=30, **kwargs):
    """
    Download a PDF from ``url`` and extract its text.

    Parameters:
        url (str): The URL of the PDF file to download.
        *args: Additional positional arguments passed to `from_buffer`.
        timeout (float, optional): Seconds to wait for the server before
            giving up (keyword-only). Defaults to 30.
        **kwargs: Additional keyword arguments passed to `from_buffer`.

    Returns:
        Whatever `from_buffer` returns for the downloaded content.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout, requests may wait on an unresponsive server forever.
    response = requests.get(url, timeout=timeout)
    # Fail here instead of passing an HTML error page on as if it were a PDF.
    response.raise_for_status()
    return from_buffer(BytesIO(response.content), *args, **kwargs)
87
126
88
127
if __name__ == '__main__':
    # Manual smoke test: convert a local PDF and write its text to
    # 'teste_saida.txt' (second positional argument = output path).
    path = 'C:/Users/bibil/Documents/PyJPDF/teste2.pdf'
    # from_buffer only calls .read(), so the open file can be passed
    # directly; the with-block guarantees the handle is closed.
    with open(path, 'rb') as pdf_file:
        from_buffer(pdf_file, 'teste_saida.txt')
0 commit comments