Skip to content

Commit 651e966

Browse files
authored
Merge pull request #28 from roleecorn/dev
Dev
2 parents 4ad44fe + 693c1f8 commit 651e966

18 files changed

+288
-167
lines changed

app.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from flask import Flask, render_template, jsonify, request
1+
from flask import Flask, render_template, jsonify
2+
from flask import send_from_directory
23
import flask
34
import time
45
import re
@@ -51,6 +52,11 @@ def start_cite():
5152
return jsonify(redirect_url='/src/index.html')
5253

5354

55+
@app.route('/favicon.ico')
def favicon():
    """Answer ``/favicon.ico`` requests with ``web.ico`` from ``home / 'src'``."""
    icon_dir = home / 'src'
    return send_from_directory(icon_dir, 'web.ico')
58+
59+
5460
@app.route('/run_test')
5561
def run_test():
5662
thread = threading.Thread(target=background_task, args=('test',))
@@ -77,6 +83,8 @@ def run_ocr():
7783
thread = threading.Thread(target=background_task, args=('ocr',))
7884
thread.start()
7985
return jsonify(message="進行圖片光學解析")
86+
87+
8088
# run_ocr
8189
if __name__ == '__main__':
8290
app.run(debug=False)

cite_envs/eddiebauer.yml

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
lastupdate: 2023-09-08
22
target: tile_wrapper_outer
3-
nextpage:
4-
item: load_more
5-
method: extend
3+
nextpage: load_more
4+
method: append
65
position:
76
money:
87
X: 116

cite_envs/sample.yml

+16-15
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
1-
lastupdate: '2023-09-30 14:23:57.318304'
2-
nextpage: ''
1+
lastupdate: 2023-09-08
2+
target: tile_wrapper_outer
3+
nextpage: load_more
4+
method: append
35
position:
4-
money:
5-
X: 169
6-
Y: 200
7-
h: -18
8-
w: -166
9-
star:
10-
X: 0
11-
Y: 0
12-
h: 111
13-
w: 0
14-
title:
6+
money:
7+
X: 116
8+
Y: 460
9+
w: -114
10+
h: -14
11+
title:
1512
X: 4
1613
Y: 424
17-
h: 38
1814
w: 332
19-
target: tile_wrapper_outer
15+
h: 38
16+
star:
17+
X: 116
18+
Y: 485
19+
w: -27
20+
h: -22

classversion.py

+44-101
Original file line numberDiff line numberDiff line change
@@ -2,84 +2,57 @@
22
import sqlite3
33
from pathlib import Path
44
from datetime import datetime
5-
import os
65
import time
7-
6+
from typing import List
87
# Third-party imports
98
from dotenv import load_dotenv
109
from selenium.common.exceptions import NoSuchElementException
1110
from selenium.common.exceptions import ElementNotInteractableException
1211
import yaml
12+
from views.selenium_drive import driver_class
13+
from views.new_db import sql_add
1314

1415
# Load environment variables (special case)
1516
load_dotenv()
1617

1718
# Local application imports that depend on environment variables
18-
import util
19-
import driver_control
2019
from debugger import Debugger
20+
import driver_control
21+
import util
22+
2123

2224

23-
class ocr_crawler:
25+
class ocr_crawler(driver_class):
2426
def __init__(self, cite: str, test: bool = True) -> None:
27+
super().__init__()
2528
self.cite = cite
2629
self.home = Path.cwd()
2730
self.test = test
28-
# self.driverpath = self.home / 'chromedriver.exe'
29-
self.driverpath = Path(os.getenv("DriverPath"))
30-
self.chromepath = Path(os.getenv("Chromepath"))
31-
self.driver = None
3231
self.db_path = self.home / 'sql' / f"{cite}.db"
3332
self.read_csv()
34-
self.current_date = str(datetime.now().date())
33+
current_time = datetime.now()
34+
self.current_date = str(current_time.date())
35+
self.version = int(current_time.timestamp())
3536
if not self.db_path.exists():
36-
self.sql_add()
37+
Debugger.info_print("新增資料庫")
38+
sql_add(db_path=self.db_path, cite=cite)
3739
time.sleep(1)
3840
with open(self.home / 'cite_envs' / f"{cite}.yml", 'r') as file:
3941
cite_config = yaml.safe_load(file)
4042
self.target_class: str = cite_config.get('target', "")
4143
self.position: dict[str, dict[str, int]
4244
] = cite_config.get('position', {})
43-
self.nextpage: dict[str, str] = cite_config.get('nextpage', {})
44-
# self.SESSION = sqlite3.connect(self.db_path)
45-
self.version = int(time.time())
46-
47-
def sql_add(self) -> None:
48-
Debugger.info_print('new database')
49-
status = sqlite3.connect(self.db_path)
50-
query = f"""
51-
CREATE TABLE {self.cite} (
52-
name TEXT,
53-
imgcode TEXT,
54-
oriprice INT,
55-
price INT,
56-
color INT,
57-
feature TEXT,
58-
gender INT,
59-
brand TEXT,
60-
fabric TEXT,
61-
path TEXT,
62-
ver INT
63-
)
64-
"""
65-
status.execute(query)
66-
status.close()
45+
self.nextpage: str = cite_config.get('nextpage', "")
46+
self.nextpage_method: str = cite_config.get('method', "")
6747

6848
def read_csv(self) -> None:
    """
    Load this site's URL list and the matching feature lists.

    Reads ``cite_file/<cite>_test.csv`` under ``self.home`` through
    ``util.read_csv`` and stores the two values it returns in
    ``self.listsite`` and ``self.site_feature`` (in that order).
    """
    csv_path = self.home / 'cite_file' / f'{self.cite}_test.csv'
    sites, features = util.read_csv(csv_path)
    self.listsite = sites
    self.site_feature = features
7151

72-
def new_driver(self):
73-
Debugger.info_print('new driver')
74-
self.driver = util.new_driver(dpath=self.driverpath,
75-
cpath=self.chromepath)
76-
77-
def close(self):
78-
if self.driver:
79-
self.driver.close()
80-
81-
def one_page_start(self, imgpath: Path, img_features: list[str],
82-
ocr: bool = False):
52+
def one_page_capture(self, imgpath: Path,
53+
img_features: List[str],
54+
ocr: bool = False,
55+
save: bool = True):
8356
"""
8457
執行一個頁面的截圖
8558
"""
@@ -90,94 +63,67 @@ def one_page_start(self, imgpath: Path, img_features: list[str],
9063
"class name", self.target_class)
9164
except NoSuchElementException:
9265
Debugger.error_print('element not find in target_element')
66+
return
9367
SESSION = sqlite3.connect(self.db_path)
9468
for element in target_elements:
9569
tmp = util.capture(ele=element, path=imgpath)
9670
datas = {
97-
# "price": 100.50,
98-
# "oriprice": 150.00,
9971
"imgcode": tmp,
100-
# "facturer": "ABC Company",
101-
# "feature": "Waterproof",
102-
# "color": "Blue",
103-
# "name": "Cool Shoe",
104-
# "star": 4.5,
10572
"path": '/'.join(img_features),
106-
# "sex": 1,
107-
# # Assuming 0 for Female, 1 for Male, -1 for Unknown
10873
"ver": self.version
10974
}
11075
if ocr:
111-
datas["price"] = util.Ocr(img=imgpath/tmp,
112-
posit=self.position['money'])
113-
datas["name"] = util.Ocr(img=imgpath/tmp,
114-
posit=self.position['title'])
115-
datas["star"] = util.Ocr(img=imgpath/tmp,
116-
posit=self.position['star'])
76+
price = util.Ocr(img=imgpath/tmp,
77+
posit=self.position['money'])
78+
datas["price"] = util.remove_non_number(price)
79+
name = util.Ocr(img=imgpath/tmp,
80+
posit=self.position['title'])
81+
datas["name"] = util.remove_non_alphanumeric(name)
82+
star = util.Ocr(img=imgpath/tmp,
83+
posit=self.position['star'])
84+
datas["star"] = util.remove_non_number(star)
11785
clo = util.cloth(datas=datas)
11886
clo.writedb(db=SESSION, tablename=self.cite)
11987
del clo
120-
SESSION.commit()
88+
if save:
89+
SESSION.commit()
90+
else:
91+
SESSION.rollback()
12192
SESSION.close()
12293

12394
def test_start(self):
    """
    Run a trial capture against the first URL in ``self.listsite``.

    The URL is requested, and requested once more if, after a 3-second
    wait, the browser is not on that URL. The page is then captured into
    the ``test`` image folder for the current date with ``save=False``.
    """
    Debugger.info_print('Test start')
    first_url = self.listsite[0]
    self.driver.get(url=first_url)
    time.sleep(3)
    landed_elsewhere = self.driver.current_url != first_url
    if landed_elsewhere:
        # The browser ended up on a different address; request it again.
        self.driver.get(url=first_url)
    day_dir = self.home / 'image' / self.cite / self.current_date
    imgpath = util.check_imgpath(imgpath=day_dir, imgfile=['test'])
    self.one_page_capture(
        imgpath=imgpath,
        img_features=['test'],
        save=False,
    )
135107

136108
def regular_start(self, subcite: int, ocr: bool = False):
    """
    Crawl the URL selected by ``subcite`` and capture its items.

    The URL is requested (and requested a second time if the browser did
    not end up on it), the image folder for today's date is prepared, and
    the page is captured according to ``self.nextpage_method``:

    * ``'append'`` - ``driver_control.append_page`` is run first, then the
      page is captured once.
    * ``'next'`` - ``driver_control.next_page`` captures page by page.

    :param subcite: index into ``self.listsite`` and ``self.site_feature``.
    :param ocr: forwarded unchanged to the capture step.
    :raises AttributeError: if ``self.nextpage_method`` is neither
        ``'append'`` nor ``'next'``.
    """
    img_features = self.site_feature[subcite]
    url = self.listsite[subcite]
    self.driver.get(url=url)
    if self.driver.current_url != url:
        # The browser ended up on a different address; request it again.
        self.driver.get(url=url)
    p = self.home / 'image' / self.cite / self.current_date
    imgpath = util.check_imgpath(imgpath=p,
                                 imgfile=img_features)
    if self.nextpage_method == 'append':
        driver_control.append_page(obj=self)
        self.one_page_capture(
            imgpath=imgpath, img_features=img_features, ocr=ocr)
    elif self.nextpage_method == 'next':
        driver_control.next_page(obj=self, imgpath=imgpath,
                                 img_features=img_features,
                                 ocr=ocr)
    else:
        # Same exception type as before, but say what was wrong so a bad
        # 'method' value in the site's YAML file can be diagnosed.
        raise AttributeError(
            f"unsupported nextpage method {self.nextpage_method!r} "
            f"for {self.cite!r}; expected 'append' or 'next'")
183129

@@ -206,10 +152,7 @@ def shot_all_classes(self):
206152
try:
207153
ele = self.driver.find_element(
208154
"class name", element)
209-
except NoSuchElementException:
210-
continue
211-
except Exception as e:
212-
# Debugger.error_print(str(e))
155+
except Exception:
213156
continue
214157
try:
215158
util.capture(ele=ele, path=search_path, name=element)

driver_control/__init__.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
from .scroll_to_bottom_and_wait import scroll_to_bottom_and_wait
22
from .scroll_to_bottom_and_wait import go_bottom_and_wait
33
from .get_all_class import get_all_classes
4+
from .nextpage import append_page, next_page
45

56
__all__ = ["scroll_to_bottom_and_wait",
67
"get_all_classes",
78
"go_bottom_and_wait",
9+
'append_page', 'next_page',
810
]
9-
__version__ = "0.2"
11+
__version__ = "0.3"

driver_control/get_all_class.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from selenium import webdriver
2+
from typing import List
23

34

4-
def get_all_classes(driver: webdriver.Chrome) -> list[str]:
5+
def get_all_classes(driver: webdriver.Chrome) -> List[str]:
56
"""
67
Get all unique class names from the current page.
78
:param driver: The selenium webdriver instance.

driver_control/nextpage.py

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from selenium.common.exceptions import NoSuchElementException
2+
from selenium.webdriver import ActionChains
3+
from debugger import Debugger
4+
5+
6+
def next_page(obj, imgpath, img_features, ocr):
    """
    Capture a listing page by page.

    Each iteration captures the current page through
    ``obj.one_page_capture`` and then clicks the element whose class name
    is ``obj.nextpage``. The loop stops when that element cannot be found
    or when locating/clicking it raises any other exception.

    :param obj: object providing ``driver``, ``nextpage`` and
        ``one_page_capture``.
    :param imgpath: forwarded to ``one_page_capture``.
    :param img_features: forwarded to ``one_page_capture``.
    :param ocr: forwarded to ``one_page_capture``.
    """
    while True:
        obj.one_page_capture(
            imgpath=imgpath, img_features=img_features, ocr=ocr)
        try:
            button = obj.driver.find_element(
                "class name", obj.nextpage)
            # Move the pointer onto the control, then click where it is.
            ActionChains(obj.driver).move_to_element(button).click().perform()
        except NoSuchElementException:
            # No control left to click: report it the same way
            # append_page does instead of leaving silently.
            Debugger.info_print('no nextpage')
            break
        except Exception as e:
            Debugger.error_print(str(e))
            break
24+
25+
26+
def append_page(obj):
    """
    Click the element with class name ``obj.nextpage`` until it is gone.

    Every iteration looks the element up again, moves the pointer onto it
    and clicks. The loop ends when the element is not found (logged as
    info) or when any other exception is raised (logged as an error).

    :param obj: object providing ``driver`` and ``nextpage``.
    """
    # NOTE(review): there is no pause between clicks and no upper bound on
    # the number of iterations - confirm the page removes the control once
    # everything has been loaded.
    keep_loading = True
    while keep_loading:
        try:
            load_more = obj.driver.find_element(
                "class name", obj.nextpage)
            chain = ActionChains(obj.driver)
            chain.move_to_element(load_more).click().perform()
        except NoSuchElementException:
            Debugger.info_print('no nextpage')
            keep_loading = False
        except Exception as err:
            Debugger.error_print(str(err))
            keep_loading = False

0 commit comments

Comments
 (0)