Commit 3162818 (#16)

* Start implementing `bbc` scraper and logic_scraper
* Update scrapers README.md
* Add `Task` object

1 parent 7556566 · commit 3162818

File tree

11 files changed: +288 -39 lines changed

db_driver/db_objects/task.py

Lines changed: 26 additions & 0 deletions

```python
import datetime
from dataclasses import dataclass, asdict, field
from typing import List, Optional

from db_driver.db_objects.timestamp import Timestamp


@dataclass
class Task:
    task_id: str
    url: str
    domain: str
    status: str
    type: str
    status_timestamp: List[Timestamp] = field(default_factory=lambda: [])
    creation_time: datetime.datetime = None
    collecting_time: datetime.datetime = None  # todo: check if needed

    def __repr__(self) -> str:
        string = ''
        for prop, value in vars(self).items():
            string += f"{str(prop)}: {str(value)}\n"
        return string

    def convert_to_dict(self) -> dict:
        return asdict(self)
```

db_driver/db_objects/timestamp.py

Lines changed: 19 additions & 0 deletions

```python
import datetime
from dataclasses import dataclass, asdict
from typing import List, Optional


@dataclass
class Timestamp:
    status: str
    start_time: datetime.datetime
    end_time: datetime.datetime

    def __repr__(self) -> str:
        string = ''
        for prop, value in vars(self).items():
            string += f"{str(prop)}: {str(value)}\n"
        return string

    def convert_to_dict(self) -> dict:
        return asdict(self)
```
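For orientation, here is a minimal usage sketch of how these two dataclasses fit together: a `Timestamp` records a single status window and `Task.status_timestamp` accumulates them. All field values below are invented for illustration.

```python
from datetime import datetime
from uuid import uuid4

from db_driver.db_objects.task import Task
from db_driver.db_objects.timestamp import Timestamp

# Illustrative values only; the field names come from the dataclasses above.
task = Task(
    task_id=str(uuid4()),
    url="https://www.bbc.com/news/some-article",
    domain="bbc",
    status="pending",
    type="collect_article",
    creation_time=datetime.now(),
)
task.status_timestamp.append(
    Timestamp(status="pending", start_time=task.creation_time, end_time=None)
)

print(task)                    # __repr__ prints one "field: value" line per attribute
print(task.convert_to_dict())  # asdict() recurses, so the nested Timestamp becomes a dict too
```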

db_driver/mongodb_driver.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -117,18 +117,18 @@ def delete_many(self, table_name: str, data_filter: dict) -> bool:
     def update_one(self, table_name: str, data_filter: dict, new_data: dict) -> ObjectId:
         try:
             self.logger.debug(f"Trying to delete one data from table: '{table_name}', db: '{self.DB_NAME}'")
-            res = self.__db[table_name].update_one(data_filter, new_data)
+            res = self.__db[table_name].update_one(data_filter, {"$set": new_data})
             if res:
                 object_id = res.raw_result.get('_id')
                 self.logger.info(
                     f"updated one data from db: '{self.DB_NAME}', table_name: '{table_name}', id: '{object_id}'")
                 return object_id
             else:
-                desc = f"Error delete data with filter: {data_filter}, table: '{table_name}, db: {self.DB_NAME}'"
+                desc = f"Error update data with filter: {data_filter}, table: '{table_name}, db: {self.DB_NAME}'"
                 self.logger.error(desc)
-                raise DeleteDataDBException(desc)
+                raise UpdateDataDBException(desc)
         except Exception as e:
-            self.logger.error(f"Error delete one from db: {str(e)}")
+            self.logger.error(f"Error update one from db: {str(e)}")
             raise e
 
     @log_function
```
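The `$set` wrapper is the substantive fix in this hunk: PyMongo's `update_one` expects an update document built from update operators, so passing the raw field dict raises a `ValueError` instead of updating anything. A standalone illustration (the connection string, database, and collection names are assumptions):

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed local MongoDB instance
tasks = client["scraping_db"]["tasks"]             # assumed db/collection names

# Before the fix: PyMongo rejects an update document without $ operators (raises ValueError).
# tasks.update_one({"task_id": "abc"}, {"status": "running"})

# After the fix: the new values are wrapped in an update operator.
tasks.update_one({"task_id": "abc"}, {"$set": {"status": "running"}})
```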

db_driver/utils/consts.py

Lines changed: 3 additions & 0 deletions

```python
class DBConsts:
    TASKS_TABLE_NAME = "tasks"
    ARTICLE_TABLE_NAME = "articles"
```

scrapers/README.md

Lines changed: 23 additions & 14 deletions

````diff
@@ -1,6 +1,6 @@
 # Scraper Component
 
-> ### <i>Scraper Flow</i>
+> ## <i>Scraper Flow</i>
 
 ```mermaid
 graph TD;
@@ -10,17 +10,26 @@ a{ scraping-task } -->scrape-new-articles-urls
     scrape-new-articles-urls --> scrape-articles-content;
 ```
 
-> ### <i>Running Scraping Task</i>
+> ## <i>Running Scraping Task</i>
 
-1. get `pending` collecting task from db
-2. get website scraper instance using factory
-3. init scraper driver
-4. get url page
-5. get urls list
-6. filter only new urls
-7. for each url:
-   1. get to url page
-   2. collect article content
-   3. save to db
-   4. update task list of collected articles *
-8. update task as `succeeded` or `failed`
+### Type of task - `collect_urls`
+
+1. get `pending` collecting task from db - if not found pending - get by `failed` status
+2. set task status as `running`
+3. get website scraper instance using factory
+4. init scraper driver
+5. get url page
+6. get urls list
+7. filter only new urls
+8. for each new url - create new task for collecting content
+
+### Type of task - `collect_article`
+
+1. get `pending` collecting task from db - if not found pending - get by `failed` status
+2. set task status as `running`
+3. get website scraper instance using factory
+4. init scraper driver
+5. get url page (article)
+6. collect article content
+7. save to db
+8. update task as `succeeded` or `failed`
````
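The `collect_article` flow above is documented ahead of its implementation; in `scrapers/logic_scraper.py` below the matching branch is still a stub. Purely as a hedged illustration of how those numbered steps line up with interfaces introduced in this commit — not the author's implementation — a sketch could look like this (the `Article` serialization helper is assumed by analogy with `Task.convert_to_dict()`):

```python
from db_driver.utils.consts import DBConsts
from scrapers import websites_scrapers_factory


# Hypothetical sketch only: maps the `collect_article` steps onto this commit's interfaces.
def handle_collect_article(db, task):
    scraper = websites_scrapers_factory(scraper_name=task.domain)  # step 3: factory lookup
    article = scraper.get_article(url=task.url)                    # steps 5-6 (BBCScraper.get_article still raises NotImplementedError)
    db.insert_one(table_name=DBConsts.ARTICLE_TABLE_NAME,          # step 7: save to db
                  data=article.convert_to_dict())                  # assumed Article helper, mirroring Task.convert_to_dict()
    # step 8 would mark the task `succeeded` or `failed`, e.g. via LogicScaper._update_task_status
```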

scrapers/__init__.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -1,8 +1,9 @@
 from logger import get_current_logger
+from scrapers.websites_scrapers.bbc_scraper import BBCScraper
 from scrapers.websites_scrapers.utils.exceptions import UnknownWebsiteScraperException
 from scrapers.websites_scrapers.website_scraper_base import WebsiteScraperBase
 
-SCRAPERS = {}  # example: "bbc": BBCWebsiteScraper
+SCRAPERS = {"bbc": BBCScraper}  # example: "bbc": BBCWebsiteScraper
 
 
 def websites_scrapers_factory(scraper_name: str, *args, **kwargs) -> WebsiteScraperBase:
@@ -17,7 +18,7 @@ def websites_scrapers_factory(scraper_name: str, *args, **kwargs) -> WebsiteScra
     try:
         return SCRAPERS[scraper_name](*args, **kwargs)
     except KeyError:
-        desc = f"Cannot find scraper name: `{scraper_name}` in {SCRAPERS}"
+        desc = f"Cannot find scraper name: `{scraper_name}` in {SCRAPERS.keys()}"
         logger.error(desc)
         raise UnknownWebsiteScraperException(desc)
     except Exception as e:
```
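A small usage sketch of the factory after this change; the `"bbc"` key and `UnknownWebsiteScraperException` come from the code above, while the unregistered `"cnn"` name is just an example:

```python
from scrapers import websites_scrapers_factory
from scrapers.websites_scrapers.bbc_scraper import BBCScraper
from scrapers.websites_scrapers.utils.exceptions import UnknownWebsiteScraperException

scraper = websites_scrapers_factory(scraper_name="bbc")
assert isinstance(scraper, BBCScraper)

try:
    websites_scrapers_factory(scraper_name="cnn")  # not registered in SCRAPERS yet
except UnknownWebsiteScraperException as e:
    print(e.msg)  # Cannot find scraper name: `cnn` in dict_keys(['bbc'])
```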

scrapers/logic_scraper.py

Lines changed: 107 additions & 14 deletions

```diff
@@ -1,17 +1,110 @@
+from datetime import datetime
+from time import sleep
+from typing import List
+from uuid import uuid4
+
+from pymongo.errors import ConnectionFailure
+
+from db_driver import get_current_db_driver
+from db_driver.db_objects.db_objects_utils import get_db_object_from_dict
+from db_driver.db_objects.task import Task
+from db_driver.utils.consts import DBConsts
+from db_driver.utils.exceptions import DataNotFoundDBException, UpdateDataDBException, InsertDataDBException
+from logger import get_current_logger
+from scrapers import websites_scrapers_factory
+from scrapers.websites_scrapers.utils.consts import MainConsts
+
+
 class LogicScaper:
-    pass
+    SLEEPING_TIME = 60 * 15
+
+    def __init__(self):
+        self.logger = get_current_logger()
+        self._db = get_current_db_driver()
+
+    def _get_task_by_status(self, status: str):
+        try:
+            task: dict = self._db.get_one(table_name=DBConsts.TASKS_TABLE_NAME, data_filter={"status": status})
+            task_object: Task = get_db_object_from_dict(task, Task)
+            return task_object
+        except DataNotFoundDBException:
+            return None
+
+    def _get_new_task(self) -> Task:
+        for status in ["pending", "failed"]:
+            task = self._get_task_by_status(status=status)
+            if task:
+                return task
+
+    def _update_task_status(self, task_id: str, status: str):
+        try:
+            data_filter = {"task_id": task_id}
+            new_data = {"status": status}
+            self._db.update_one(table_name=DBConsts.TASKS_TABLE_NAME, data_filter=data_filter, new_data=new_data)
+        except UpdateDataDBException as e:
+            desc = f"Error updating task as `running`"
+            self.logger.error(desc)
+            raise e
+
+    def _filter_only_not_exits_articles(self, urls: List[str]) -> List[str]:
+        data_filter = {"url": {"$in": urls}}
+        exists_articles = self._db.get_many(table_name=DBConsts.ARTICLE_TABLE_NAME, data_filter=data_filter)
+        exists_articles_urls = {exists_article.get("url") for exists_article in exists_articles}
+        new_articles = list(set(urls).difference(exists_articles_urls))
+        return new_articles
+
+    def _create_new_task(self, url: str, domain: str):
+        for trie in range(MainConsts.TIMES_TRY_CREATE_TASK):
+            try:
+                task_data = {
+                    "task_id": str(uuid4()),
+                    "url": url,
+                    "domain": domain,
+                    "status": "pending",
+                    "type": MainConsts.COLLECT_ARTICLE,
+                    "creation_time": datetime.now()
+                }
+                new_task: dict = Task(**task_data).convert_to_dict()
+                inserted_id = self._db.insert_one(table_name=DBConsts.TASKS_TABLE_NAME, data=new_task)
+                self.logger.info(f"Created new task inserted_id: {inserted_id}")
+                return
+            except Exception as e:
+                self.logger.warning(f"Error create new task NO. {trie}/{MainConsts.TIMES_TRY_CREATE_TASK} - {str(e)}")
+                continue
+        desc = f"Error creating new task into db after {MainConsts.TIMES_TRY_CREATE_TASK} tries"
+        raise InsertDataDBException(desc)
+
+    def _handle_task(self, task: Task):
+        if task.type == MainConsts.COLLECT_URLS:
+            website_scraper = websites_scrapers_factory(scraper_name=task.domain)
+            urls = website_scraper.get_new_article_urls_from_home_page()
+            urls = self._filter_only_not_exits_articles(urls=urls)
+            for url in urls:
+                try:
+                    self._create_new_task(url=url, domain=task.domain)
+                except Exception as e:
+                    desc = f"Error creating new task with type: {MainConsts.COLLECT_ARTICLE} - {str(e)}"
+                    self.logger.error(desc)
+        elif task.type == MainConsts.COLLECT_ARTICLE:
+            pass
 
     def run(self):
-        pass
-        # 1. get `pending` collecting task from db
-        # 2. get website scraper instance using factory
-        # 3. init scraper driver
-        # 4. get url page
-        # 5. get urls list
-        # 6. filter only new urls
-        # 7. for each url:
-        #    1. get to url page
-        #    2. collect article content
-        #    3. save to db
-        #    4. update task list of collected articles *
-        # 8. update task as `succeeded` or `failed`
+        while True:
+            try:
+                task = self._get_new_task()
+                if task:
+                    self._update_task_status(task_id=task.task_id, status="running")
+                    self._handle_task(task=task)
+                else:
+                    self.logger.debug(f"Couldn't find task, sleeping for {self.SLEEPING_TIME / 60} minutes")
+                    sleep(self.SLEEPING_TIME)
+            except ConnectionFailure as e:
+                self.logger.warning(f"Error connecting to db, initialize the db again - {str(e)}")
+                self._db = get_current_db_driver()
+            except Exception as e:
+                self.logger.warning(f"Error handle task - {str(e)}")
+
+
+if __name__ == '__main__':
+    logic_scraper = LogicScaper()
+    logic_scraper.run()
```
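`_filter_only_not_exits_articles` is the de-duplication step from the README flow: one `$in` query fetches every already-stored URL, and a set difference keeps only the new ones. A self-contained sketch of the same idea, with the database call stubbed out:

```python
from typing import Dict, List


def filter_new_urls(candidate_urls: List[str], existing_docs: List[Dict]) -> List[str]:
    """existing_docs stands in for the result of
    get_many(table_name="articles", data_filter={"url": {"$in": candidate_urls}})."""
    existing_urls = {doc.get("url") for doc in existing_docs}
    return list(set(candidate_urls).difference(existing_urls))


print(filter_new_urls(
    candidate_urls=["https://www.bbc.com/news/a", "https://www.bbc.com/news/b"],
    existing_docs=[{"url": "https://www.bbc.com/news/a"}],
))  # -> ['https://www.bbc.com/news/b']
```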

scrapers/websites_scrapers/bbc_scraper.py

Lines changed: 76 additions & 0 deletions

```python
import os
from datetime import datetime
from typing import List

from selenium.webdriver.common.by import By

from db_driver.db_objects.article import Article
from logger import get_current_logger, log_function
from scrapers.websites_scrapers.website_scraper_base import WebsiteScraperBase
from scrapers.scraper_drivers import get_scraping_driver
from scrapers.websites_scrapers.utils.consts import ScraperConsts, MainConsts
from scrapers.websites_scrapers.utils.exceptions import FailedGetURLException


class BBCScraper(WebsiteScraperBase):
    USE_REQUEST_DRIVER = bool(os.getenv(key="USE_REQUEST_DRIVER", default=True))
    HEADLESS = bool(os.getenv(key="HEADLESS", default=True))

    def __init__(self):
        self.logger = get_current_logger()
        self._driver = get_scraping_driver(via_request=self.USE_REQUEST_DRIVER, headless=self.HEADLESS)
        self._url = ScraperConsts.BBC_HOME_PAGE

    @log_function
    def _get_home_page(self):
        exception = None
        for trie in range(MainConsts.TIMES_TRY_GET_HOMEPAGE):
            try:
                self._driver.get_url(url=self._url)
                self.logger.info(f"Successfully get home page -> `{self._url}`")
                return
            except Exception as e:
                exception = e
                desc = f"Cannot get into home page try NO. {trie + 1}/{MainConsts.TIMES_TRY_GET_HOMEPAGE} - {str(e)}"
                self.logger.warning(desc)
        desc = f"Failed get home page -> {self._url} after {MainConsts.TIMES_TRY_GET_HOMEPAGE} tries - {exception}"
        self.logger.error(desc)
        raise FailedGetURLException(desc)

    def _get_article_page(self, url: str):
        raise NotImplementedError

    def _get_article_title(self) -> str:
        raise NotImplementedError

    def _get_article_content_text(self) -> str:
        raise NotImplementedError

    def _get_article_publishing_time(self) -> datetime:
        raise NotImplementedError

    def _get_article_category(self) -> str:
        # default return - 'general'
        raise NotImplementedError

    def _get_article_image_urls(self) -> List[str]:
        # default return - empty list
        raise NotImplementedError

    def _get_article_state(self) -> str:
        # default return - 'global'
        raise NotImplementedError

    def get_new_article_urls_from_home_page(self) -> List[str]:
        self._get_home_page()
        articles_urls = []
        articles_elements = self._driver.find_elements(by=By.CLASS_NAME, value="block-link__overlay-link")
        for element in articles_elements:
            href = element.get_attribute("href")
            if self._url not in href:
                href = self._url + href
            articles_urls.append(href)
        return articles_urls

    def get_article(self, url: str) -> Article:
        raise NotImplementedError
```
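One behavioral detail of the class-level flags above: `bool()` over `os.getenv()` only tests string truthiness, so once the variable is set to any non-empty value — including `"false"` or `"0"` — the flag evaluates to `True`. A quick demonstration, plus a common alternative parser (the `env_flag` helper name is hypothetical, not part of this commit):

```python
import os

os.environ["HEADLESS"] = "false"
print(bool(os.getenv(key="HEADLESS", default=True)))  # True - non-empty strings are truthy


def env_flag(name: str, default: bool = True) -> bool:
    """Hypothetical helper: interpret common truthy strings explicitly."""
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in {"1", "true", "yes", "on"}


print(env_flag("HEADLESS"))  # False
```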

scrapers/websites_scrapers/utils/consts.py

Lines changed: 12 additions & 0 deletions

```python
import os


class ScraperConsts:
    BBC_HOME_PAGE = "https://www.bbc.com/"


class MainConsts:
    COLLECT_URLS = "collect_urls"
    COLLECT_ARTICLE = "collect_article"
    TIMES_TRY_CREATE_TASK = int(os.getenv(key="TIMES_TRY_CREATE_TASK", default=3))
    TIMES_TRY_GET_HOMEPAGE = int(os.getenv(key="TIMES_TO_TRY_GET_HOMEPAGE", default=3))
```
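Both retry counts are read once at import time with a fallback of 3; note that the second constant reads the `TIMES_TO_TRY_GET_HOMEPAGE` environment key. A minimal demonstration of the override behavior:

```python
import os

os.environ["TIMES_TRY_CREATE_TASK"] = "5"                          # simulate an environment override
print(int(os.getenv(key="TIMES_TRY_CREATE_TASK", default=3)))      # 5
print(int(os.getenv(key="TIMES_TO_TRY_GET_HOMEPAGE", default=3)))  # 3 - falls back when the variable is unset
```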

scrapers/websites_scrapers/utils/exceptions.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -1,3 +1,8 @@
 class UnknownWebsiteScraperException(Exception):
     def __init__(self, msg: str):
         self.msg = msg
+
+
+class FailedGetURLException(Exception):
+    def __init__(self, msg: str):
+        self.msg = msg
```
