import re
+from datetime import date

import dateparser
+from fuzzywuzzy import process
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjCampoGoytacazesSpider(BaseGazetteSpider):
+    name = "rj_campos_goytacazes"
    TERRITORY_ID = "3301009"
-
    allowed_domains = ["www.campos.rj.gov.br"]
-    name = "rj_campos_goytacazes"
-    start_urls = [
-        "https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15"
+    start_urls = ["https://www.campos.rj.gov.br/diario-oficial.php"]
+    start_date = date(2013, 11, 1)
+    months = [
+        "janeiro",
+        "fevereiro",
+        "março",
+        "abril",
+        "maio",
+        "junho",
+        "julho",
+        "agosto",
+        "setembro",
+        "outubro",
+        "novembro",
+        "dezembro",
    ]

    def parse(self, response):
-        """
-        @url https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15
-        @returns requests 1
-        @returns items 15 15
-        @scrapes date file_urls is_extra_edition power
-        """
-
        for element in response.css("ul.ul-licitacoes li"):
+            gazette_data = element.css("h4::text")
            gazette_text = element.css("h4::text").get("")

-            date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
-            if not date_re:
+            date = self.extract_date(gazette_text)
+            if not date or date > self.end_date:
                continue
+            if date < self.start_date:
+                return

-            date = date_re.group(0)
-            # The extra edition for August 28th, 2018 has a typo in the month name.
-            date = date.replace("Agosoto", "Agosto")
-            # The edition for December 17th, 2012 has a typo in the month name.
-            date = date.replace("Dezembrbo", "Dezembro")
-            date = dateparser.parse(date, languages=["pt"]).date()
+            edition_number = gazette_data.re_first(r"Edição.*\s(\d+)")

            path_to_gazette = element.css("a::attr(href)").get().strip()
            # From November 17th, 2017 and backwards the path to the gazette PDF
            # is relative.
            if path_to_gazette.startswith("up/diario_oficial.php"):
                path_to_gazette = response.urljoin(path_to_gazette)

-            is_extra_edition = gazette_text.startswith("Suplemento")
+            is_extra_edition = bool(
+                re.search(r"extra|supl|revis", gazette_text, re.IGNORECASE)
+            )

            yield Gazette(
                date=date,
-                file_urls=[path_to_gazette],
+                edition_number=edition_number,
                is_extra_edition=is_extra_edition,
+                file_urls=[path_to_gazette],
                power="executive",
            )
@@ -60,3 +68,32 @@ def parse(self, response):
        )
        if next_url:
            yield Request(response.urljoin(next_url))
+
72
+ def extract_date (self , text ):
73
+ """Extract a date from a text. This method attempts to correct typing errors in the month.
74
+
75
+ Args:
76
+ text: A text containing a date with the name of the month full version (%B)
77
+
78
+ Returns:
79
+ The date, if match. Otherwise, returns None.
80
+ """
81
+
82
+ match_date = re .search (r"\d{1,2}º?(\sde)? +(\w+)(\sde)? +\d{4}" , text )
83
+ if not match_date :
84
+ return None
85
+
86
+ raw_date = match_date .group (0 )
87
+ raw_date = raw_date .replace ("º" , "" ).replace ("°" , "" )
88
+ month = match_date .group (2 )
89
+ if month .lower () not in self .months :
90
+ match_month , score = process .extractOne (month , self .months )
91
+ if score < 70 :
92
+ return None
93
+ raw_date = raw_date .replace (month , match_month )
94
+ self .logger .warning (
95
+ f' Erro de digitação em "{ text } ". CORRIGIDO DE { month } PARA { match_month } '
96
+ )
97
+
98
+ parsed_datetime = dateparser .parse (raw_date , languages = ["pt" ])
99
+ return parsed_datetime .date () if parsed_datetime else None
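Note on the month-typo correction: the new extract_date replaces the hard-coded "Agosoto"/"Dezembrbo" fixes with fuzzy matching against the months list. The snippet below is a minimal standalone sketch (not part of the spider) showing that correction path in isolation; it assumes fuzzywuzzy and dateparser are installed, reuses the spider's 70-point cutoff, and takes its sample string from the August 2018 typo mentioned in the removed comment.

import dateparser
from fuzzywuzzy import process

# Same lowercase month names as RjCampoGoytacazesSpider.months.
MONTHS = [
    "janeiro", "fevereiro", "março", "abril", "maio", "junho",
    "julho", "agosto", "setembro", "outubro", "novembro", "dezembro",
]

raw_date = "28 de Agosoto de 2018"  # typo cited in the removed comment
month = "Agosoto"

# Fuzzy-match the misspelled month against the known month names.
best_month, score = process.extractOne(month, MONTHS)

if score >= 70:  # same cutoff the spider uses before trusting the match
    corrected = raw_date.replace(month, best_month)
    parsed = dateparser.parse(corrected, languages=["pt"])
    print(parsed.date() if parsed else None)  # expected: 2018-08-28
else:
    print(None)

When the month is too garbled for a confident match, the score falls below the cutoff and the spider returns None rather than guessing a date.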