2
2
import sqlite3
3
3
from pathlib import Path
4
4
from datetime import datetime
5
- import os
6
5
import time
7
-
6
+ from typing import List
8
7
# Third-party imports
9
8
from dotenv import load_dotenv
10
9
from selenium .common .exceptions import NoSuchElementException
11
10
from selenium .common .exceptions import ElementNotInteractableException
12
11
import yaml
12
+ from views .selenium_drive import driver_class
13
+ from views .new_db import sql_add
13
14
14
15
# Load environment variables (special case)
15
16
load_dotenv ()
16
17
17
18
# Local application imports that depend on environment variables
18
- import util
19
- import driver_control
20
19
from debugger import Debugger
20
+ import driver_control
21
+ import util
22
+
21
23
22
24
23
- class ocr_crawler :
25
+ class ocr_crawler ( driver_class ) :
24
26
def __init__(self, cite: str, test: bool = True) -> None:
    """Set up a crawler for one site.

    Resolves working paths from the current working directory, loads the
    site's URL list, creates the SQLite database file when it does not
    exist yet, and reads the per-site YAML configuration.

    Args:
        cite: Site identifier. It names the database file
            (``sql/<cite>.db``), is passed to ``sql_add``, and selects
            the URL list and the ``cite_envs/<cite>.yml`` config file.
        test: Stored unchanged as ``self.test``.

    Raises:
        OSError: If the YAML configuration file cannot be opened.
    """
    super().__init__()
    self.cite = cite
    self.home = Path.cwd()
    self.test = test
    self.db_path = self.home / 'sql' / f"{cite}.db"
    self.read_csv()

    # Take a single clock reading so the date string and the version
    # stamp always describe the same instant.
    current_time = datetime.now()
    self.current_date = str(current_time.date())
    self.version = int(current_time.timestamp())

    if not self.db_path.exists():
        Debugger.info_print("新增資料庫")
        sql_add(db_path=self.db_path, cite=cite)
        # NOTE(review): presumably gives the new database file time to
        # settle before it is used; confirm the pause is still needed.
        time.sleep(1)

    with open(self.home / 'cite_envs' / f"{cite}.yml", 'r') as file:
        # safe_load returns None for an empty document; fall back to an
        # empty mapping so the .get() lookups below use their defaults
        # instead of raising AttributeError.
        cite_config = yaml.safe_load(file) or {}

    self.target_class: str = cite_config.get('target', "")
    self.position: dict[str, dict[str, int]] = cite_config.get(
        'position', {})
    self.nextpage: str = cite_config.get('nextpage', "")
    self.nextpage_method: str = cite_config.get('method', "")
67
47
68
48
def read_csv(self) -> None:
    """Load this site's URL list and the matching feature lists.

    Reads ``cite_file/<cite>_test.csv`` under ``self.home`` through
    ``util.read_csv`` and stores the two returned values as
    ``self.listsite`` and ``self.site_feature``.
    """
    csv_file = self.home / 'cite_file' / f'{self.cite}_test.csv'
    loaded = util.read_csv(csv_file)
    self.listsite, self.site_feature = loaded
71
51
72
- def new_driver (self ):
73
- Debugger .info_print ('new driver' )
74
- self .driver = util .new_driver (dpath = self .driverpath ,
75
- cpath = self .chromepath )
76
-
77
- def close (self ):
78
- if self .driver :
79
- self .driver .close ()
80
-
81
- def one_page_start (self , imgpath : Path , img_features : list [str ],
82
- ocr : bool = False ):
52
+ def one_page_capture (self , imgpath : Path ,
53
+ img_features : List [str ],
54
+ ocr : bool = False ,
55
+ save : bool = True ):
83
56
"""
84
57
執行一個頁面的截圖
85
58
"""
@@ -90,94 +63,67 @@ def one_page_start(self, imgpath: Path, img_features: list[str],
90
63
"class name" , self .target_class )
91
64
except NoSuchElementException :
92
65
Debugger .error_print ('element not find in target_element' )
66
+ return
93
67
SESSION = sqlite3 .connect (self .db_path )
94
68
for element in target_elements :
95
69
tmp = util .capture (ele = element , path = imgpath )
96
70
datas = {
97
- # "price": 100.50,
98
- # "oriprice": 150.00,
99
71
"imgcode" : tmp ,
100
- # "facturer": "ABC Company",
101
- # "feature": "Waterproof",
102
- # "color": "Blue",
103
- # "name": "Cool Shoe",
104
- # "star": 4.5,
105
72
"path" : '/' .join (img_features ),
106
- # "sex": 1,
107
- # # Assuming 0 for Female, 1 for Male, -1 for Unknown
108
73
"ver" : self .version
109
74
}
110
75
if ocr :
111
- datas ["price" ] = util .Ocr (img = imgpath / tmp ,
112
- posit = self .position ['money' ])
113
- datas ["name" ] = util .Ocr (img = imgpath / tmp ,
114
- posit = self .position ['title' ])
115
- datas ["star" ] = util .Ocr (img = imgpath / tmp ,
116
- posit = self .position ['star' ])
76
+ price = util .Ocr (img = imgpath / tmp ,
77
+ posit = self .position ['money' ])
78
+ datas ["price" ] = util .remove_non_number (price )
79
+ name = util .Ocr (img = imgpath / tmp ,
80
+ posit = self .position ['title' ])
81
+ datas ["name" ] = util .remove_non_alphanumeric (name )
82
+ star = util .Ocr (img = imgpath / tmp ,
83
+ posit = self .position ['star' ])
84
+ datas ["star" ] = util .remove_non_number (star )
117
85
clo = util .cloth (datas = datas )
118
86
clo .writedb (db = SESSION , tablename = self .cite )
119
87
del clo
120
- SESSION .commit ()
88
+ if save :
89
+ SESSION .commit ()
90
+ else :
91
+ SESSION .rollback ()
121
92
SESSION .close ()
122
93
123
94
def test_start(self):
    """
    Run a trial capture against the first URL in ``self.listsite``.

    The capture is requested with ``save=False``, so the rows written
    during the trial are rolled back instead of committed.
    """
    Debugger.info_print('Test start')
    first_url = self.listsite[0]
    browser = self.driver

    browser.get(url=first_url)
    time.sleep(3)
    # NOTE(review): presumably retries once when the browser ended up on
    # a different address (redirect or failed load); confirm intent.
    if browser.current_url != first_url:
        browser.get(url=first_url)

    day_dir = self.home / 'image' / self.cite / self.current_date
    imgpath = util.check_imgpath(imgpath=day_dir, imgfile=['test'])
    self.one_page_capture(
        imgpath=imgpath,
        img_features=['test'],
        save=False,
    )
135
107
136
108
def regular_start(self, subcite: int, ocr: bool = False):
    """
    Crawl the URL selected by index and capture its items.

    Args:
        subcite: Index into ``self.listsite`` / ``self.site_feature``.
        ocr: Forwarded to the capture step to enable OCR extraction.

    Raises:
        AttributeError: If ``self.nextpage_method`` is neither
            ``'append'`` nor ``'next'``.
    """
    img_features = self.site_feature[subcite]
    target_url = self.listsite[subcite]

    self.driver.get(url=target_url)
    # NOTE(review): presumably retries once when the browser ended up on
    # a different address (redirect or failed load); confirm intent.
    if self.driver.current_url != target_url:
        self.driver.get(url=target_url)

    p = self.home / 'image' / self.cite / self.current_date
    imgpath = util.check_imgpath(imgpath=p, imgfile=img_features)

    if self.nextpage_method == 'append':
        # NOTE(review): append_page presumably expands the listing in
        # place before the single capture below; verify in driver_control.
        driver_control.append_page(obj=self)
        self.one_page_capture(
            imgpath=imgpath, img_features=img_features, ocr=ocr)
    elif self.nextpage_method == 'next':
        # Paging and capturing are both delegated to driver_control here.
        driver_control.next_page(obj=self, imgpath=imgpath,
                                 img_features=img_features,
                                 ocr=ocr)
    else:
        # Same exception type as before, now naming the rejected value so
        # a misconfigured site file can be diagnosed.
        raise AttributeError(
            f"unsupported nextpage method {self.nextpage_method!r}; "
            "expected 'append' or 'next'")
183
129
@@ -206,10 +152,7 @@ def shot_all_classes(self):
206
152
try :
207
153
ele = self .driver .find_element (
208
154
"class name" , element )
209
- except NoSuchElementException :
210
- continue
211
- except Exception as e :
212
- # Debugger.error_print(str(e))
155
+ except Exception :
213
156
continue
214
157
try :
215
158
util .capture (ele = ele , path = search_path , name = element )
0 commit comments