1
- from datetime import datetime
2
1
from time import sleep
3
2
from typing import List
4
- from uuid import uuid4
5
3
6
4
from pymongo .errors import ConnectionFailure
7
5
8
6
from db_driver import get_current_db_driver
9
- from db_driver .db_objects .db_objects_utils import get_db_object_from_dict
10
7
from db_driver .db_objects .task import Task
11
8
from db_driver .utils .consts import DBConsts
12
- from db_driver .utils .exceptions import DataNotFoundDBException , UpdateDataDBException , InsertDataDBException
13
- from logger import get_current_logger
9
+ from logger import get_current_logger , log_function
14
10
from scrapers import websites_scrapers_factory
15
11
from scrapers .websites_scrapers .utils .consts import MainConsts
12
+ from server_utils .db_utils .task_utils import TaskUtils
16
13
17
14
18
15
class LogicScaper :
@@ -21,82 +18,57 @@ class LogicScaper:
21
18
def __init__(self) -> None:
    """Attach the collaborators used by the other methods of this class.

    Stores, in this order, the current logger, the current DB driver and a
    fresh ``TaskUtils`` helper on the instance.
    """
    self.logger = get_current_logger()
    self._db = get_current_db_driver()
    self.task_utils = TaskUtils()
24
22
25
- def _get_task_by_status (self , status : str ):
26
- try :
27
- task : dict = self ._db .get_one (table_name = DBConsts .TASKS_TABLE_NAME , data_filter = {"status" : status })
28
- task_object : Task = get_db_object_from_dict (task , Task )
29
- return task_object
30
- except DataNotFoundDBException :
31
- return None
32
-
33
- def _get_new_task (self ) -> Task :
34
- for status in ["pending" , "failed" ]:
35
- task = self ._get_task_by_status (status = status )
36
- if task :
37
- return task
38
-
39
- def _update_task_status (self , task_id : str , status : str ):
40
- try :
41
- data_filter = {"task_id" : task_id }
42
- new_data = {"status" : status }
43
- self ._db .update_one (table_name = DBConsts .TASKS_TABLE_NAME , data_filter = data_filter , new_data = new_data )
44
- except UpdateDataDBException as e :
45
- desc = f"Error updating task as `running`"
46
- self .logger .error (desc )
47
- raise e
48
-
23
@log_function
def _filter_only_not_exits_articles(self, urls: List[str]) -> List[str]:
    """Return the URLs from ``urls`` that have no stored article yet.

    All candidates are looked up in the articles table with a single
    ``$in`` query, and every URL that comes back is dropped. Duplicates in
    ``urls`` are collapsed, and the result keeps the order in which each URL
    first appeared, so callers receive a deterministic list.

    :param urls: candidate article URLs
    :return: the de-duplicated candidates that were not found in the
        articles table; an empty list when ``urls`` is empty
    """
    if not urls:
        # Nothing to look up, so skip the DB round-trip entirely.
        return []

    data_filter = {"url": {"$in": urls}}
    exists_articles = self._db.get_many(table_name=DBConsts.ARTICLE_TABLE_NAME, data_filter=data_filter)
    exists_articles_urls = {exists_article.get("url") for exists_article in exists_articles}

    # dict.fromkeys de-duplicates like set() did, but keeps first-seen order
    # instead of the arbitrary iteration order of a set difference.
    return [url for url in dict.fromkeys(urls) if url not in exists_articles_urls]
55
30
56
@log_function
def _create_collecting_article_tasks(self, urls: List[str], domain: str):
    """Create one ``COLLECT_ARTICLE`` task per URL, on a best-effort basis.

    Each URL is handed to ``self.task_utils.create_new_task``. A failure for
    one URL is logged as an error and does not stop the remaining URLs from
    being processed; nothing is raised to the caller and nothing is returned.

    :param urls: article URLs to create tasks for
    :param domain: domain passed through to every created task
    """
    for article_url in urls:
        try:
            self.task_utils.create_new_task(
                url=article_url,
                domain=domain,
                task_type=MainConsts.COLLECT_ARTICLE,
            )
        except Exception as error:
            # Deliberately broad: one bad URL must not abort the whole batch.
            self.logger.error(
                f"Error creating new task with type: {MainConsts.COLLECT_ARTICLE} - {str(error)} "
            )
76
39
77
@log_function
def _handle_task(self, task: Task) -> bool:
    """Execute ``task`` and report whether it finished without an exception.

    For a ``COLLECT_URLS`` task the scraper for ``task.domain`` is built, the
    article URLs from its home page are fetched, URLs that already have a
    stored article are filtered out, and a collecting task is created for
    each remaining URL.

    NOTE(review): the ``COLLECT_ARTICLE`` branch is a no-op, and a task whose
    type matches neither constant skips both branches; in both cases this
    method still returns ``True``.

    :param task: the task to run
    :return: ``False`` if any exception was raised while handling the task
        (it is logged, not re-raised), ``True`` otherwise
    """
    try:
        if task.type == MainConsts.COLLECT_URLS:
            scraper = websites_scrapers_factory(scraper_name=task.domain)
            home_page_urls = scraper.get_new_article_urls_from_home_page()
            unseen_urls = self._filter_only_not_exits_articles(urls=home_page_urls)
            self._create_collecting_article_tasks(urls=unseen_urls, domain=task.domain)
            self.logger.info(f"Done handle task of type: `{task.type} `")
        elif task.type == MainConsts.COLLECT_ARTICLE:
            # Placeholder: no handling is performed for this task type here.
            pass
    except Exception as error:
        # Any failure is reported through the return value, not re-raised.
        self.logger.error(
            f"Failed run task task_id: `{task.task_id} `, type: `{task.type} ` - {str(error)} "
        )
        return False
    else:
        return True
90
56
57
+ @log_function
91
58
def run (self ):
92
59
while True :
93
60
try :
94
- task = self ._get_new_task ()
61
+ task = self .task_utils . get_new_task ()
95
62
if task :
96
- self ._update_task_status (task_id = task .task_id , status = "running" )
97
- self ._handle_task (task = task )
63
+ self .logger = get_current_logger (task_id = task .task_id , task_type = task .type )
64
+ self .task_utils .update_task_status (task = task , status = "running" )
65
+ is_task_succeeded = self ._handle_task (task = task )
66
+ if is_task_succeeded :
67
+ self .task_utils .update_task_status (task = task , status = "succeeded" )
68
+ else :
69
+ self .task_utils .update_task_status (task = task , status = "failed" )
98
70
else :
99
- self .logger .debug (f"Couldn't find task, sleeping for { self .SLEEPING_TIME / 60 } minutes" )
71
+ self .logger .warning (f"Couldn't find task, sleeping for { self .SLEEPING_TIME / 60 } minutes" )
100
72
sleep (self .SLEEPING_TIME )
101
73
except ConnectionFailure as e :
102
74
self .logger .warning (f"Error connecting to db, initialize the db again - { str (e )} " )
0 commit comments