Skip to content

Commit bc24011

Browse files
committed
(#16)
* Create `db_utils` in `server_utils` * Create `validation_utils` in `server_utils` * Continue implementing the logic scraper
1 parent 3162818 commit bc24011

File tree

10 files changed

+126
-70
lines changed

10 files changed

+126
-70
lines changed

db_driver/db_objects/task.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class Task:
1212
domain: str
1313
status: str
1414
type: str
15-
status_timestamp: List[Timestamp] = field(default_factory=lambda: [])
15+
status_timestamp: List[Timestamp | dict] = field(default_factory=lambda: [])
1616
creation_time: datetime.datetime = None
1717
collecting_time: datetime.datetime = None # todo: check if needed
1818

db_driver/db_objects/timestamp.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
@dataclass
77
class Timestamp:
88
status: str
9-
start_time: datetime.datetime
10-
end_time: datetime.datetime
9+
time_changed: datetime.datetime
1110

1211
def __repr__(self) -> str:
1312
string = ''

db_driver/mongodb_driver.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from db_driver.utils.exceptions import InsertDataDBException, DataNotFoundDBException, DeleteDataDBException, \
99
UpdateDataDBException
1010
from logger import get_current_logger, log_function
11-
from server_utils.db_utils import get_mongodb_connection_string
11+
from server_utils.db_utils.validation_utils import get_mongodb_connection_string
1212

1313

1414
class MongoDBDriver(DBDriverInterface):
@@ -116,7 +116,7 @@ def delete_many(self, table_name: str, data_filter: dict) -> bool:
116116
@log_function
117117
def update_one(self, table_name: str, data_filter: dict, new_data: dict) -> ObjectId:
118118
try:
119-
self.logger.debug(f"Trying to delete one data from table: '{table_name}', db: '{self.DB_NAME}'")
119+
self.logger.debug(f"Trying to update one data from table: '{table_name}', db: '{self.DB_NAME}'")
120120
res = self.__db[table_name].update_one(data_filter, {"$set": new_data})
121121
if res:
122122
object_id = res.raw_result.get('_id')
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/bash
# Rebuild the local `scapres` staging directory with the scraper sources
# and the shared server_utils package.
# NOTE(review): "scapres" looks like a typo for "scrapers" — kept as-is since
# other tooling may reference this directory name; confirm before renaming.

# BUGFIX: original read `rf -r scapres` — `rf` is not a command; `rm -rf` intended.
rm -rf scapres
mkdir scapres
cp ../* scapres
# BUGFIX: `server_utils` is a directory — plain `cp` fails with "omitting directory";
# `-r` is required to copy it.
cp -r ../../server_utils scapres

scrapers/logic_scraper.py

Lines changed: 36 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,15 @@
1-
from datetime import datetime
21
from time import sleep
32
from typing import List
4-
from uuid import uuid4
53

64
from pymongo.errors import ConnectionFailure
75

86
from db_driver import get_current_db_driver
9-
from db_driver.db_objects.db_objects_utils import get_db_object_from_dict
107
from db_driver.db_objects.task import Task
118
from db_driver.utils.consts import DBConsts
12-
from db_driver.utils.exceptions import DataNotFoundDBException, UpdateDataDBException, InsertDataDBException
13-
from logger import get_current_logger
9+
from logger import get_current_logger, log_function
1410
from scrapers import websites_scrapers_factory
1511
from scrapers.websites_scrapers.utils.consts import MainConsts
12+
from server_utils.db_utils.task_utils import TaskUtils
1613

1714

1815
class LogicScaper:
@@ -21,82 +18,57 @@ class LogicScaper:
2118
def __init__(self):
    """Wire up the collaborators used by the scraping loop.

    Grabs the shared logger and db driver singletons and a TaskUtils
    helper for task lifecycle operations (create / fetch / update status).
    """
    # Shared project singletons — obtained, not constructed, here.
    self.logger = get_current_logger()
    self._db = get_current_db_driver()
    # Task helper encapsulates all `tasks` table access.
    self.task_utils = TaskUtils()
2422

25-
def _get_task_by_status(self, status: str):
26-
try:
27-
task: dict = self._db.get_one(table_name=DBConsts.TASKS_TABLE_NAME, data_filter={"status": status})
28-
task_object: Task = get_db_object_from_dict(task, Task)
29-
return task_object
30-
except DataNotFoundDBException:
31-
return None
32-
33-
def _get_new_task(self) -> Task:
34-
for status in ["pending", "failed"]:
35-
task = self._get_task_by_status(status=status)
36-
if task:
37-
return task
38-
39-
def _update_task_status(self, task_id: str, status: str):
40-
try:
41-
data_filter = {"task_id": task_id}
42-
new_data = {"status": status}
43-
self._db.update_one(table_name=DBConsts.TASKS_TABLE_NAME, data_filter=data_filter, new_data=new_data)
44-
except UpdateDataDBException as e:
45-
desc = f"Error updating task as `running`"
46-
self.logger.error(desc)
47-
raise e
48-
23+
@log_function
def _filter_only_not_exits_articles(self, urls: List[str]) -> List[str]:
    """Return only the urls that are not already stored as articles.

    Queries the articles table for any of `urls` in one `$in` lookup and
    subtracts the hits. NOTE: result order is unspecified (set difference).
    """
    query = {"url": {"$in": urls}}
    existing = self._db.get_many(table_name=DBConsts.ARTICLE_TABLE_NAME, data_filter=query)
    known_urls = {doc.get("url") for doc in existing}
    return list(set(urls) - known_urls)
5530

56-
def _create_new_task(self, url: str, domain: str):
57-
for trie in range(MainConsts.TIMES_TRY_CREATE_TASK):
31+
@log_function
def _create_collecting_article_tasks(self, urls: List[str], domain: str):
    """Create one `collect_article` task per url for `domain`.

    Best-effort: a failure for a single url is logged and does not stop
    task creation for the remaining urls.
    """
    task_type = MainConsts.COLLECT_ARTICLE
    for article_url in urls:
        try:
            self.task_utils.create_new_task(url=article_url, domain=domain, task_type=task_type)
        except Exception as e:
            self.logger.error(f"Error creating new task with type: {task_type} - {str(e)}")
7639

77-
def _handle_task(self, task: Task):
78-
if task.type == MainConsts.COLLECT_URLS:
79-
website_scraper = websites_scrapers_factory(scraper_name=task.domain)
80-
urls = website_scraper.get_new_article_urls_from_home_page()
81-
urls = self._filter_only_not_exits_articles(urls=urls)
82-
for url in urls:
83-
try:
84-
self._create_new_task(url=url, domain=task.domain)
85-
except Exception as e:
86-
desc = f"Error creating new task with type: {MainConsts.COLLECT_ARTICLE} - {str(e)}"
87-
self.logger.error(desc)
88-
elif task.type == MainConsts.COLLECT_ARTICLE:
89-
pass
40+
@log_function
def _handle_task(self, task: Task) -> bool:
    """Dispatch `task` by its type; return True on success, False if handling raised.

    `collect_urls`: scrape the domain's home page, keep only urls not yet
    stored as articles, and queue a `collect_article` task for each.
    `collect_article`: not implemented yet.
    """
    try:
        if task.type == MainConsts.COLLECT_URLS:
            scraper = websites_scrapers_factory(scraper_name=task.domain)
            homepage_urls = scraper.get_new_article_urls_from_home_page()
            fresh_urls = self._filter_only_not_exits_articles(urls=homepage_urls)
            self._create_collecting_article_tasks(urls=fresh_urls, domain=task.domain)
            self.logger.info(f"Done handle task of type: `{task.type}`")
        elif task.type == MainConsts.COLLECT_ARTICLE:
            # TODO: article collection flow not implemented yet.
            pass
    except Exception as e:
        self.logger.error(f"Failed run task task_id: `{task.task_id}`, type: `{task.type}` - {str(e)}")
        return False
    return True
9056

57+
@log_function
9158
def run(self):
9259
while True:
9360
try:
94-
task = self._get_new_task()
61+
task = self.task_utils.get_new_task()
9562
if task:
96-
self._update_task_status(task_id=task.task_id, status="running")
97-
self._handle_task(task=task)
63+
self.logger = get_current_logger(task_id=task.task_id, task_type=task.type)
64+
self.task_utils.update_task_status(task=task, status="running")
65+
is_task_succeeded = self._handle_task(task=task)
66+
if is_task_succeeded:
67+
self.task_utils.update_task_status(task=task, status="succeeded")
68+
else:
69+
self.task_utils.update_task_status(task=task, status="failed")
9870
else:
99-
self.logger.debug(f"Couldn't find task, sleeping for {self.SLEEPING_TIME / 60} minutes")
71+
self.logger.warning(f"Couldn't find task, sleeping for {self.SLEEPING_TIME / 60} minutes")
10072
sleep(self.SLEEPING_TIME)
10173
except ConnectionFailure as e:
10274
self.logger.warning(f"Error connecting to db, initialize the db again - {str(e)}")

scrapers/websites_scrapers/utils/consts.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,4 @@ class ScraperConsts:
88
class MainConsts:
99
COLLECT_URLS = "collect_urls"
1010
COLLECT_ARTICLE = "collect_article"
11-
TIMES_TRY_CREATE_TASK = int(os.getenv(key="TIMES_TRY_CREATE_TASK", default=3))
1211
TIMES_TRY_GET_HOMEPAGE = int(os.getenv(key="TIMES_TO_TRY_GET_HOMEPAGE", default=3))

server_utils/db_utils/__init__.py

Whitespace-only changes.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import os


class TaskConsts:
    """Constants for task persistence and the creation retry policy."""

    # Attempts to insert a new task before giving up; overridable via the
    # TIMES_TRY_CREATE_TASK environment variable (default 3).
    TIMES_TRY_CREATE_TASK = int(os.getenv("TIMES_TRY_CREATE_TASK", 3))
    TASKS_TABLE_NAME = "tasks"


class ArticleConsts:
    """Placeholder for article-related constants (none defined yet)."""


class ClusterConsts:
    """Placeholder for cluster-related constants (none defined yet)."""

server_utils/db_utils/task_utils.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from datetime import datetime
2+
from uuid import uuid4
3+
4+
from db_driver import get_current_db_driver
5+
from db_driver.db_objects.db_objects_utils import get_db_object_from_dict
6+
from db_driver.db_objects.task import Task
7+
from db_driver.db_objects.timestamp import Timestamp
8+
from db_driver.utils.exceptions import InsertDataDBException, UpdateDataDBException, DataNotFoundDBException
9+
from logger import get_current_logger, log_function
10+
from server_utils.db_utils.db_utils_consts import TaskConsts
11+
12+
13+
class TaskUtils:
    """Helpers for creating, updating and fetching scraping `Task` documents in the db."""

    def __init__(self):
        self.logger = get_current_logger()
        self._db = get_current_db_driver()

    @log_function
    def create_new_task(self, url: str, domain: str, task_type: str):
        """Insert a new `pending` task for `url`/`domain` of type `task_type`.

        Retries up to TaskConsts.TIMES_TRY_CREATE_TASK times and raises
        InsertDataDBException when every attempt fails.
        """
        for attempt in range(TaskConsts.TIMES_TRY_CREATE_TASK):
            try:
                task_data = {
                    "task_id": str(uuid4()),
                    "url": url,
                    "domain": domain,
                    "status": "pending",
                    "type": task_type,
                    # Seed the history with the initial "pending" transition.
                    "status_timestamp": [Timestamp(status="pending", time_changed=datetime.now())],
                    "creation_time": datetime.now()
                }
                new_task: dict = Task(**task_data).convert_to_dict()
                inserted_id = self._db.insert_one(table_name=TaskConsts.TASKS_TABLE_NAME, data=new_task)
                self.logger.info(f"Created new task inserted_id: {inserted_id}")
                return
            except Exception as e:
                # BUGFIX: log 1-based attempt numbers — original printed "0/3" on the first try.
                self.logger.warning(
                    f"Error create new task NO. {attempt + 1}/{TaskConsts.TIMES_TRY_CREATE_TASK} - {str(e)}")
                continue
        desc = f"Error creating new task into db after {TaskConsts.TIMES_TRY_CREATE_TASK} tries"
        raise InsertDataDBException(desc)

    @log_function
    def update_task_status(self, task: Task, status: str):
        """Set `task`'s status and append a matching Timestamp to its history.

        Raises UpdateDataDBException (re-raised after logging) on db failure.
        NOTE: mutates `task.status_timestamp` in place on the passed object.
        """
        try:
            data_filter = {"task_id": task.task_id}
            new_timestamp = Timestamp(status=status, time_changed=datetime.now())
            # Stored as a plain dict — Task.status_timestamp accepts Timestamp | dict.
            task.status_timestamp.append(new_timestamp.convert_to_dict())
            new_data = {"status": status, "status_timestamp": task.status_timestamp}
            self._db.update_one(table_name=TaskConsts.TASKS_TABLE_NAME, data_filter=data_filter, new_data=new_data)
        except UpdateDataDBException as e:
            desc = f"Error updating task task_id: `{task.task_id}` as status: `{status}`"
            self.logger.error(desc)
            raise e

    @log_function
    def _get_task_by_status(self, status: str) -> Task | None:
        """Return one task with the given status, or None when none exists."""
        try:
            task: dict = self._db.get_one(table_name=TaskConsts.TASKS_TABLE_NAME, data_filter={"status": status})
            task_object: Task = get_db_object_from_dict(task, Task)
            return task_object
        except DataNotFoundDBException:
            return None

    @log_function
    def get_new_task(self) -> Task | None:
        """Return the next task to run — `pending` first, then `failed` — or None.

        BUGFIX: annotation was `-> Task` although falling through returns None;
        the None return is now explicit and the annotation honest.
        """
        for status in ["pending", "failed"]:
            task = self._get_task_by_status(status=status)
            if task:
                return task
        return None
File renamed without changes.

0 commit comments

Comments
 (0)