Skip to content

Commit d4d330c

Browse files
committed
Done implementing base of requests_driver.py and chrome_driver.py (#6) (#10)
1 parent ef7b2b8 commit d4d330c

File tree

9 files changed

+316
-69
lines changed

9 files changed

+316
-69
lines changed

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ selenium~=4.5.0
33
setuptools==65.5.1
44
requests~=2.28.2
55
bs4~=0.0.1
6-
beautifulsoup4~=4.12.0
6+
beautifulsoup4~=4.12.0
7+
lxml~=4.9.2

scrapers/scraper_component/chrome_driver.py

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from datetime import datetime
22
from time import sleep
3+
from typing import List
34

4-
from selenium.common import InvalidArgumentException, NoSuchElementException, TimeoutException
5+
from selenium.common import InvalidArgumentException, NoSuchElementException, TimeoutException, WebDriverException
56
from selenium.webdriver import ActionChains, Keys
7+
from selenium.webdriver.chrome.options import Options
68

79
from logger import get_current_logger, log_function
810
from scrapers.scraper_component.utils.driver_consts import BrowserConsts, MainConsts
@@ -11,6 +13,8 @@
1113
create_path_if_needed, kill_browser_childes
1214
from selenium import webdriver
1315

16+
from scrapers.scraper_component.utils.element import Element
17+
1418

1519
class ChromeDriver(BaseDriverInterface):
1620
def __init__(self, browser_type: str = BrowserConsts.CHROME, browser_profile_path: str = None,
@@ -73,13 +77,19 @@ def __exit__(self, exc_type, exc_val, exc_tb):
7377
@log_function
7478
def __init_chrome_driver__(self):
7579
try:
76-
options = webdriver.ChromeOptions()
80+
chrome_options = Options()
81+
chrome_options.add_argument('--no-sandbox')
82+
chrome_options.add_argument('--disable-dev-shm-usage')
83+
chrome_options.add_argument(argument=f"user-data-dir={self.browser_profile_path}")
7784
if self.headless:
78-
options.add_argument('--headless')
79-
options.add_argument('--no-sandbox')
80-
options.add_argument('--disable-dev-shm-usage')
81-
options.add_argument(argument=f"user-data-dir={self.browser_profile_path}")
82-
self._driver = webdriver.Chrome(executable_path=self.webdriver_path, options=options)
85+
chrome_options.add_argument("--headless")
86+
chrome_options.add_argument('start-maximized')
87+
chrome_options.add_argument('disable-infobars')
88+
chrome_options.add_argument("--disable-extensions")
89+
start_time = datetime.now()
90+
self._driver = webdriver.Chrome(executable_path=self.webdriver_path, options=chrome_options)
91+
end_time = datetime.now()
92+
self.logger.info(f"Init chrome driver in {(end_time - start_time).total_seconds()} seconds")
8393
except Exception as e:
8494
if "executable needs to be in path" in str(e).lower():
8595
self.logger.error(f"PATH Error")
@@ -93,31 +103,49 @@ def __init_chrome_driver__(self):
93103
@log_function
94104
def exit(self):
    """Quit the underlying Selenium driver and log that the driver was closed."""
    driver = self._driver
    driver.quit()
    self.logger.info("Exit Chrome Driver")
97107

98108
@log_function
99109
def get_url(self, url: str):
    """Navigate the browser to ``url``, retrying only on connection resets.

    Up to ``MainConsts.GET_URL_TRIES`` attempts are made. An attempt is retried
    only when the driver reports ``ERR_CONNECTION_RESET``; any other failure is
    logged, the driver is shut down through ``__error_and_exit`` and the method
    returns immediately.

    :param url: full URL of the page to open.
    """
    total_tries = MainConsts.GET_URL_TRIES
    for trie in range(total_tries):
        try:
            self.logger.debug(f"Trying to get page url: `{url}` NO. {trie + 1}/{total_tries}")
            self._driver.get(url)
            self.logger.info(f"Get to page url: `{url}`")
            return
        except InvalidArgumentException:
            desc = f"Error getting url: '{url}' - invalid url input format, please give full correct format"
            self.__error_and_exit(desc)
            # The driver has been quit; retrying would only fail on a dead session.
            return
        except WebDriverException as e:
            if "ERR_CONNECTION_RESET" in str(e):
                # Transient network failure: the only case worth another attempt.
                continue
            self.__error_and_exit(f"Error getting to page url: `{url}` - {str(e)}")
            return
        except Exception as e:
            self.__error_and_exit(f"Error getting to page url: `{url}` - {str(e)}")
            return
    # Reached only when every attempt ended with a connection reset.
    self.__error_and_exit(f"Error getting to page url: `{url}` after {total_tries} tries")
128+
129+
@log_function
130+
def __error_and_exit(self, desc):
    """Log ``desc`` at error level, then shut the driver down via ``exit()``.

    :param desc: human-readable description of the failure.
    """
    log = self.logger
    log.error(desc)
    self.exit()
105133

106134
@log_function
107135
def get_current_url(self) -> str:
    """Return the URL of the current page, or ``None`` for a new-tab page.

    The driver property is read once and cached, so the value that is checked
    against ``BrowserConsts.NEW_TAB_URLS`` is the same value that is returned.
    """
    current_url = self._driver.current_url
    if current_url in BrowserConsts.NEW_TAB_URLS:
        return None
    return current_url
109137

110138
@log_function
111139
def get_title(self) -> str:
    """Return the title of the current page, or ``None`` when there is none.

    ``None`` is returned when the title is empty or equals
    ``BrowserConsts.NEW_TAB_TITLE``. The driver property is read once and
    cached, so the checks and the returned value cannot disagree.
    """
    title = self._driver.title
    if not title or title == BrowserConsts.NEW_TAB_TITLE:
        return None
    return title
113141

114142
@log_function
115-
def find_element(self, by, value):
116-
return self._driver.find_element(by=by, value=value)
143+
def find_element(self, by, value) -> Element:
    """Locate one element through the Selenium driver and wrap it in ``Element``.

    :param by: locator strategy forwarded to the driver.
    :param value: locator value forwarded to the driver.
    :return: the driver's element wrapped as ``Element(read_element=...)``.
    """
    raw_element = self._driver.find_element(by=by, value=value)
    return Element(read_element=raw_element)
117145

118146
@log_function
119-
def find_elements(self, by, value):
120-
return self._driver.find_elements(by=by, value=value)
147+
def find_elements(self, by, value) -> List[Element]:
    """Locate all matching elements through the Selenium driver and wrap each in ``Element``.

    :param by: locator strategy forwarded to the driver.
    :param value: locator value forwarded to the driver.
    :return: one ``Element`` per element returned by the driver, in the same order.
    """
    raw_elements = self._driver.find_elements(by=by, value=value)
    return [Element(read_element=raw_element) for raw_element in raw_elements]
121149

122150
@log_function
123151
def wait_until_object_appears(self, by, value, timeout: int = MainConsts.DEFAULT_ELEMENT_TIMEOUT):

scrapers/scraper_component/interfaces/base_driver_interface.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
from typing import List
2+
3+
from scrapers.scraper_component.utils.element import Element
4+
5+
16
class BaseDriverInterface:
27
def get_url(self, url: str):
    """Interface stub for loading ``url``; always raises ``NotImplementedError`` here.

    Concrete drivers override this method.
    """
    raise NotImplementedError
@@ -8,8 +13,11 @@ def get_current_url(self) -> str:
813
def get_title(self) -> str:
    """Interface stub for reading the page title; always raises ``NotImplementedError`` here.

    Concrete drivers override this method.
    """
    raise NotImplementedError
1015

11-
def find_element(self, by, value):
16+
def find_element(self, by, value) -> Element:
    """Interface stub for locating a single element; always raises ``NotImplementedError`` here.

    Concrete drivers override this method.
    """
    raise NotImplementedError
18+
19+
def find_elements(self, by, value) -> List[Element]:
    """Interface stub for locating all matching elements; always raises ``NotImplementedError`` here.

    Concrete drivers override this method.
    """
    raise NotImplementedError
1321

14-
def find_elements(self, by, value):
22+
def exit(self):
    """Interface stub for shutting the driver down; always raises ``NotImplementedError`` here.

    Concrete drivers override this method.
    """
    raise NotImplementedError
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
class ElementInterface:
    """Interface for page elements; every method raises ``NotImplementedError`` until overridden."""

    def get_text(self) -> str:
        """Stub for returning the element's text."""
        raise NotImplementedError

    def get_tag_name(self) -> str:
        """Stub for returning the element's tag name."""
        raise NotImplementedError

    def get_attribute(self, attribute: str) -> str:
        """Stub for returning the value of the named ``attribute``."""
        raise NotImplementedError

    def is_hidden(self) -> bool:
        """Stub for reporting whether the element is hidden."""
        raise NotImplementedError
Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,105 @@
11
import requests
22
from bs4 import BeautifulSoup
3+
from selenium.common import NoSuchElementException
4+
from selenium.webdriver.common.by import By
35

4-
from logger import get_current_logger
5-
from scrapers.scraper_component.utils.exceptions import PageNotFoundException
6+
from logger import get_current_logger, log_function
7+
from scrapers.scraper_component.utils.driver_consts import MainConsts
8+
from scrapers.scraper_component.utils.element import Element
9+
from scrapers.scraper_component.utils.exceptions import PageNotFoundException, AttributeNameException
610
from scrapers.scraper_component.interfaces.base_driver_interface import BaseDriverInterface
711

12+
from urllib.request import urlopen
13+
from lxml import etree
14+
815
from typing import List
916

1017

1118
class RequestsDriver(BaseDriverInterface):
    """Driver that loads pages over plain HTTP and queries the parsed HTML.

    Pages are fetched with ``requests`` and parsed with BeautifulSoup. ID and
    class lookups run against the soup; XPath lookups run through lxml on the
    bytes of the same response, so no second request is made per lookup.
    """

    def __init__(self, headless: bool = False):
        """Create a driver with no page loaded.

        :param headless: stored on the instance; nothing in this class reads it.
        """
        self.logger = get_current_logger()
        self.url = None
        self._current_soup_page = None
        # Raw body of the last successful response, kept for XPath queries.
        self._current_page_content = None
        self.headless = headless

    @log_function
    def exit(self):
        """Forget the currently loaded page and log that the driver was closed."""
        self.url = None
        self._current_soup_page = None
        self._current_page_content = None
        self.logger.info("Exit Request Driver")

    @log_function
    def _get_page_tag_names(self) -> List[str]:
        """Return the distinct tag names present in the loaded page (order not guaranteed)."""
        return list({tag.name for tag in self._current_soup_page.find_all()})

    @log_function
    def get_url(self, url: str):
        """Fetch ``url`` and make it the current page.

        The request is repeated up to ``MainConsts.GET_URL_TRIES`` times until a
        response with status code 200 arrives.

        :param url: address of the page to load.
        :raises PageNotFoundException: when a request raises, or when no attempt
            returned status 200.
        """
        total_tries = MainConsts.GET_URL_TRIES
        try:
            for trie in range(total_tries):
                self.logger.debug(f"Trying to get page url: `{url}` NO. {trie + 1}/{total_tries}")
                page_res = requests.get(url, timeout=MainConsts.REQUEST_TIMEOUT)
                if page_res.status_code == 200:
                    self._current_page_content = page_res.content
                    self._current_soup_page = BeautifulSoup(page_res.text, 'html.parser')
                    self.url = page_res.url
                    self.logger.info(f"Get to page url: `{url}`")
                    return
        except Exception as e:
            desc = f"Error getting url: `{url}` - `{e}`"
            self.logger.error(desc)
            raise PageNotFoundException(desc) from e
        # Raised outside the try so the message is not re-wrapped by the handler
        # above; it reports the requested url, not the stale ``self.url``.
        desc = f"Error getting page url: `{url}` after {total_tries} tries"
        self.logger.error(desc)
        raise PageNotFoundException(desc)

    @log_function
    def get_current_url(self) -> str:
        """Return the final URL of the last successfully loaded page, or ``None``."""
        return self.url

    @log_function
    def get_title(self) -> str:
        """Return the text of the page's ``<title>``, or ``None`` if no page or no title exists."""
        if not self._current_soup_page:
            return None
        title_tag = self._current_soup_page.title
        # A document without a <title> yields None here rather than a tag.
        return title_tag.text if title_tag is not None else None

    def _find_raw_elements(self, by, value, first_only: bool = False):
        """Return the raw parser-level matches for a locator on the loaded page.

        :param by: ``By.ID``, ``By.CLASS_NAME`` or ``By.XPATH``.
        :param value: id, class name or XPath expression to look up.
        :param first_only: when true, return at most the first match.
        :raises AttributeNameException: for any other ``by`` value.
        """
        if by == By.XPATH:
            # Query the response that is already loaded instead of re-downloading the page.
            tree = etree.HTML(self._current_page_content).getroottree()
            matches = tree.xpath(value)
            return matches[:1] if first_only else matches
        if by == By.ID:
            attrs = {"id": value}
        elif by == By.CLASS_NAME:
            attrs = {"class": value}
        else:
            raise AttributeNameException(f"Cannot find element by: `{by}`")
        return self._current_soup_page.find_all(attrs=attrs, limit=1 if first_only else None)

    @log_function
    def find_element(self, by, value) -> Element:
        """Return the first element matching the locator, wrapped in ``Element``.

        :param by: ``By.ID``, ``By.CLASS_NAME`` or ``By.XPATH``.
        :param value: id, class name or XPath expression to look up.
        :raises NoSuchElementException: when nothing matches, the locator type is
            unsupported, or the lookup itself fails.
        """
        try:
            matches = self._find_raw_elements(by=by, value=value, first_only=True)
        except Exception as e:
            self.__raise_no_such_element_exception(by=by, value=value, exception=e)
        if not matches:
            self.__raise_no_such_element_exception(by=by, value=value, exception="no matching element")
        return Element(read_element=matches[0])

    def __raise_no_such_element_exception(self, by, value, exception):
        """Log a lookup failure and raise it as ``NoSuchElementException``.

        :param exception: the underlying error, or a short text reason; it is
            appended to the message, and chained as the cause when it is an exception.
        """
        desc = f"Cannot find element by: `{by}` with value: `{value}` - {str(exception)}"
        self.logger.error(desc)
        cause = exception if isinstance(exception, BaseException) else None
        raise NoSuchElementException(desc) from cause

    @log_function
    def find_elements(self, by, value) -> List[Element]:
        """Return every element matching the locator, each wrapped in ``Element``.

        :param by: ``By.ID``, ``By.CLASS_NAME`` or ``By.XPATH``.
        :param value: id, class name or XPath expression to look up.
        :return: possibly empty list of wrapped elements.
        :raises NoSuchElementException: when the locator type is unsupported or
            the lookup itself fails.
        """
        try:
            matches = self._find_raw_elements(by=by, value=value)
        except Exception as e:
            self.__raise_no_such_element_exception(by=by, value=value, exception=e)
        if matches is None:
            return []
        return [Element(read_element=match) for match in matches]

0 commit comments

Comments
 (0)