Skip to content

Commit ef7b2b8

Browse files
committed
Add web driver scraping code (#6)
1 parent 032605a commit ef7b2b8

15 files changed

+427
-18
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
.idea/*
22
db_utils/cred
3-
venv/
3+
venv/
4+
*_pids.txt

scrapers/main.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from scrapers.scraper_component import get_scraping_driver


if __name__ == '__main__':
    # Smoke run: build the browser-backed (non-requests) driver and
    # print whatever url the freshly opened browser is sitting on.
    scraping_driver = get_scraping_driver(via_request=False)
    print(scraping_driver.get_current_url())
Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
from scrapers.scraper_component.chrome_driver import ChromeDriver
from scrapers.scraper_component.requests_driver import RequestsDriver


def get_scraping_driver(via_request: bool = True, *args, **kwargs):
    """
    Factory for scraping drivers.

    :param via_request: True -> plain-HTTP ``RequestsDriver``,
                        False -> Selenium-backed ``ChromeDriver``
    :param args: forwarded verbatim to the chosen driver's constructor
    :param kwargs: forwarded verbatim to the chosen driver's constructor
    :return: a new driver instance of the selected class
    """
    driver_cls = RequestsDriver if via_request else ChromeDriver
    return driver_cls(*args, **kwargs)
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
from datetime import datetime
2+
from time import sleep
3+
4+
from selenium.common import InvalidArgumentException, NoSuchElementException, TimeoutException
5+
from selenium.webdriver import ActionChains, Keys
6+
7+
from logger import get_current_logger, log_function
8+
from scrapers.scraper_component.utils.driver_consts import BrowserConsts, MainConsts
9+
from scrapers.scraper_component.interfaces.base_driver_interface import BaseDriverInterface
10+
from scrapers.scraper_component.utils.driver_utils import get_driver_path, get_temp_browser_profile_path, \
11+
create_path_if_needed, kill_browser_childes
12+
from selenium import webdriver
13+
14+
15+
class ChromeDriver(BaseDriverInterface):
    """
    Selenium Chrome wrapper implementing ``BaseDriverInterface``.

    Resolves the chromedriver executable and browser-profile paths, supports
    headless mode, and exposes high-level helpers (navigation, element lookup,
    waiting, typing, clicking). Usable as a context manager; on exit the
    browser is quit only when ``quit_at_end`` was requested.
    """

    def __init__(self, browser_type: str = BrowserConsts.CHROME, browser_profile_path: str = None,
                 webdriver_path: str = None, headless: bool = False, quit_at_end: bool = True):
        """
        Constructor
        :param browser_type: browser name, used to resolve default driver/profile paths
        :param browser_profile_path: a browser profile path (a temp one is created when omitted)
        :param webdriver_path: selenium web driver executable path
        :param headless: open the browser headless
        :param quit_at_end: exit browser after done (honored by ``__exit__``)
        """
        # Logger
        self.logger = get_current_logger()

        # Web driver path - an explicit path wins, otherwise resolve by browser type
        self.webdriver_path = webdriver_path if webdriver_path else get_driver_path(browser_type=browser_type)
        self.logger.debug(f"WebDriver path is: '{self.webdriver_path}'")

        # Browser profile path - fall back to a temp profile, ensure it exists on disk
        if browser_profile_path:
            self.browser_profile_path = browser_profile_path
        else:
            self.browser_profile_path = get_temp_browser_profile_path(browser_type=browser_type)
        create_path_if_needed(path=self.browser_profile_path)
        self.logger.debug(f"Browser profile path is: '{self.browser_profile_path}'")

        # Exit the window after the bot is done
        self.teardown = quit_at_end

        # Headless
        self.headless = headless

        # Browser type
        self.browser_type = browser_type

        self.__init_chrome_driver__()

        # Implicitly wait time
        self._driver.implicitly_wait(MainConsts.IMPLICITLY_WAIT_TIME)

        # Maximize the page window
        self._driver.maximize_window()

        self.logger.debug(f"Initialized {self.browser_type} web driver, headless: {self.headless}")

    def __enter__(self):
        # FIX: the class defined __exit__ without __enter__, so it could not
        # actually be used in a `with` statement; returning self completes
        # the context-manager protocol.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        When the bot is done running:
        if teardown is True, quit the browser; else leave it open.
        :param exc_type: exception type (unused)
        :param exc_val: exception value (unused)
        :param exc_tb: exception traceback (unused)
        :return: None (exceptions are never suppressed)
        """
        if self.teardown:
            self.exit()

    @log_function
    def __init_chrome_driver__(self, _retried: bool = False):
        """
        Build the underlying ``webdriver.Chrome`` with the configured options.
        On a "chrome has crashed" startup error, kill leftover browser child
        processes and retry once.
        :param _retried: internal guard so the crash-recovery retry happens only once
        FIX: the original logged "run again" after killing the children but
        returned without retrying, leaving self._driver unset and causing an
        AttributeError on first use.
        """
        try:
            options = webdriver.ChromeOptions()
            if self.headless:
                options.add_argument('--headless')
                options.add_argument('--no-sandbox')
                options.add_argument('--disable-dev-shm-usage')
            options.add_argument(argument=f"user-data-dir={self.browser_profile_path}")
            # NOTE(review): `executable_path` is deprecated and removed in
            # Selenium 4 - migrate to `service=Service(self.webdriver_path)`
            # when the selenium dependency is upgraded.
            self._driver = webdriver.Chrome(executable_path=self.webdriver_path, options=options)
        except Exception as e:
            if "executable needs to be in path" in str(e).lower():
                self.logger.error("PATH Error")
            self.logger.error(f"Error initialize chrome driver - {str(e)}")
            if "chromedriver is assuming that chrome has crashed" in str(e).lower() and not _retried:
                kill_browser_childes(process_name=self.browser_type)
                self.logger.warning(f"Killed {self.browser_type} childes, run again")
                self.__init_chrome_driver__(_retried=True)
                return
            raise e

    @log_function
    def exit(self):
        """Quit the underlying driver and close the browser."""
        self._driver.quit()
        self.logger.info("ChromeDriver exit")

    @log_function
    def get_url(self, url: str):
        """
        Navigate the browser to ``url``.
        On a malformed url the error is logged and the browser is shut down
        rather than raising to the caller.
        :param url: full url including scheme
        """
        try:
            self._driver.get(url)
        except InvalidArgumentException:
            self.logger.error(f"Error getting url: '{url}' - invalid url input format, please give full correct format")
            self.exit()

    @log_function
    def get_current_url(self) -> str:
        """Return the url of the page currently loaded in the browser."""
        return self._driver.current_url

    @log_function
    def get_title(self) -> str:
        """Return the title of the current page."""
        return self._driver.title

    @log_function
    def find_element(self, by, value):
        """Return the first element matching the (by, value) locator."""
        return self._driver.find_element(by=by, value=value)

    @log_function
    def find_elements(self, by, value):
        """Return all elements matching the (by, value) locator (possibly empty)."""
        return self._driver.find_elements(by=by, value=value)

    @log_function
    def wait_until_object_appears(self, by, value, timeout: int = MainConsts.DEFAULT_ELEMENT_TIMEOUT):
        """
        Poll until an element matching the locator exists, or the timeout passes.
        :param timeout: maximum seconds to keep polling
        :raises TimeoutException: when no element appeared within ``timeout``
        """
        start_time = datetime.now()
        seconds_pass = 0
        while seconds_pass < timeout:
            seconds_pass = (datetime.now() - start_time).total_seconds()
            try:
                self.logger.debug(f"Waiting for element {value} to appears TIMEOUT: ({seconds_pass}/{timeout})")
                elements = self._driver.find_elements(by=by, value=value)
                if not elements:
                    # find_elements returns [] instead of raising, so raise
                    # ourselves to funnel both "not yet" cases into the sleep.
                    raise NoSuchElementException
                self.logger.info(f"Element {value} found")
                return
            except NoSuchElementException:
                sleep(MainConsts.ELEMENT_SLEEPING_TIME)
                continue
        raise TimeoutException

    @log_function
    def insert_text(self, by, value, text: str, press_enter_needed: bool = True):
        """
        Click the element located by (by, value) and type ``text`` into it
        character by character (ActionChains key events, with a short sleep
        between keystrokes), optionally pressing Enter afterwards.
        :raises Exception: re-raised after logging if any step fails
        """
        try:
            self.click_on_element(by=by, value=value)
            for char in text:
                ActionChains(self._driver).key_down(char).key_up(char).perform()
                sleep(MainConsts.INSERT_TEXT_SLEEPING_TIME)
            self.logger.info(f"Text: '{text}' inserted to element")
            if press_enter_needed:
                ActionChains(self._driver).key_down(Keys.ENTER).key_up(Keys.ENTER).perform()
                self.logger.info("Enter key pressed")
        except Exception as e:
            self.logger.error(f"Error while trying to insert text: '{text}' to element: '{value}' - {str(e)}")
            raise e

    @log_function
    def move_to_element(self, by, value):
        """
        Move the mouse to the element located by (by, value).
        NOTE(review): failures are logged and swallowed (no re-raise), so a
        subsequent click may land at the previous pointer position - confirm
        this best-effort behavior is intended.
        """
        try:
            action = webdriver.ActionChains(self._driver)
            element = self._driver.find_element(by=by, value=value)
            action.move_to_element(element)
            action.perform()
            self.logger.info(f"Moved to element")
        except Exception as e:
            self.logger.error(f"Error while trying to move to element - {str(e)}")

    @log_function
    def click_on_element(self, by, value):
        """
        Move to the element located by (by, value) and click it.
        :raises Exception: re-raised after logging if the click fails
        """
        try:
            self.move_to_element(by=by, value=value)
            action = webdriver.ActionChains(self._driver)
            action.click()
            action.perform()
            self.logger.info(f"Element clicked")
        except Exception as e:
            self.logger.error(f"Error while trying to click element - {str(e)}")
            raise e

scrapers/scraper_component/exceptions.py

Lines changed: 0 additions & 3 deletions
This file was deleted.

scrapers/scraper_component/interfaces/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
class RequestDriverInterface:
1+
class BaseDriverInterface:
22
def get_url(self, url: str):
33
raise NotImplementedError
44

@@ -8,5 +8,8 @@ def get_current_url(self) -> str:
88
def get_title(self) -> str:
99
raise NotImplementedError
1010

11-
def find_element(self, by, value, tag_name: str = None):
11+
def find_element(self, by, value):
12+
raise NotImplementedError
13+
14+
def find_elements(self, by, value):
1215
raise NotImplementedError
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from scrapers.scraper_component.utils.driver_consts import MainConsts
2+
3+
4+
class BaseDriverInterface:
    """
    Abstract contract for drivers that interact with page elements.
    Concrete drivers must override every method; each default here
    simply raises ``NotImplementedError``.
    """

    def wait_until_object_appears(self, by, value, timeout: int = MainConsts.DEFAULT_ELEMENT_TIMEOUT):
        """Block until an element matching (by, value) exists, or time out."""
        raise NotImplementedError

    def insert_text(self, by, value, text: str, press_enter_needed: bool = True):
        """Type ``text`` into the element at (by, value), optionally pressing Enter."""
        raise NotImplementedError

    def move_to_element(self, by, value):
        """Move the pointer/focus to the element at (by, value)."""
        raise NotImplementedError

    def click_on_element(self, by, value):
        """Click the element at (by, value)."""
        raise NotImplementedError

scrapers/scraper_component/requests_driver.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
from bs4 import BeautifulSoup
33

44
from logger import get_current_logger
5-
from scrapers.scraper_component.exceptions import PageNotFoundException
6-
from scrapers.scraper_component.request_driver_interface import RequestDriverInterface
5+
from scrapers.scraper_component.utils.exceptions import PageNotFoundException
6+
from scrapers.scraper_component.interfaces.base_driver_interface import BaseDriverInterface
77

88
from typing import List
99

1010

11-
class RequestsDriver(RequestDriverInterface):
11+
class RequestsDriver(BaseDriverInterface):
1212
def __init__(self):
1313
self.logger = get_current_logger()
1414
self.url = None
@@ -34,8 +34,12 @@ def get_current_url(self) -> str:
3434
def get_title(self) -> str:
3535
return self._current_soup_page.title.text
3636

37-
def find_element(self, by, value, tag_name: str = None):
38-
return self._current_soup_page.find(name=tag_name, attrs={by: value})
37+
def find_element(self, by, value):
38+
# return self._current_soup_page.find(name=tag_name, attrs={by: value})
39+
pass
40+
41+
def find_elements(self, by, value):
42+
pass
3943

4044

4145
if __name__ == '__main__':

scrapers/scraper_component/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)