Skip to content

Commit 54de4b4

Browse files
committed
#637 adiciona start_date e edition_number para campos-rj
1 parent 578f7a9 commit 54de4b4

File tree

1 file changed

+58
-21
lines changed

1 file changed

+58
-21
lines changed
Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,63 @@
11
import re
2+
from datetime import date
23

34
import dateparser
5+
from fuzzywuzzy import process
46
from scrapy import Request
57

68
from gazette.items import Gazette
79
from gazette.spiders.base import BaseGazetteSpider
810

911

1012
class RjCampoGoytacazesSpider(BaseGazetteSpider):
13+
name = "rj_campos_goytacazes"
1114
TERRITORY_ID = "3301009"
12-
1315
allowed_domains = ["www.campos.rj.gov.br"]
14-
name = "rj_campos_goytacazes"
15-
start_urls = [
16-
"https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15"
16+
start_urls = ["https://www.campos.rj.gov.br/diario-oficial.php"]
17+
start_date = date(2013, 11, 1)
18+
months = [
19+
"janeiro",
20+
"fevereiro",
21+
"março",
22+
"abril",
23+
"maio",
24+
"junho",
25+
"julho",
26+
"agosto",
27+
"setembro",
28+
"outubro",
29+
"novembro",
30+
"dezembro",
1731
]
1832

1933
def parse(self, response):
20-
"""
21-
@url https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15
22-
@returns requests 1
23-
@returns items 15 15
24-
@scrapes date file_urls is_extra_edition power
25-
"""
26-
2734
for element in response.css("ul.ul-licitacoes li"):
35+
gazette_data = element.css("h4::text")
2836
gazette_text = element.css("h4::text").get("")
2937

30-
date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
31-
if not date_re:
38+
date = self.extract_date(gazette_text)
39+
if not date or date > self.end_date:
3240
continue
41+
if date < self.start_date:
42+
return
3343

34-
date = date_re.group(0)
35-
# The extra edition for August 28th, 2018 has a typo in the month name.
36-
date = date.replace("Agosoto", "Agosto")
37-
# The edition for December 17th, 2012 has a typo in the month name.
38-
date = date.replace("Dezembrbo", "Dezembro")
39-
date = dateparser.parse(date, languages=["pt"]).date()
44+
edition_number = gazette_data.re_first(r"Edição.*\s(\d+)")
4045

4146
path_to_gazette = element.css("a::attr(href)").get().strip()
4247
# From November 17th, 2017 and backwards the path to the gazette PDF
4348
# is relative.
4449
if path_to_gazette.startswith("up/diario_oficial.php"):
4550
path_to_gazette = response.urljoin(path_to_gazette)
4651

47-
is_extra_edition = gazette_text.startswith("Suplemento")
52+
is_extra_edition = bool(
53+
re.search(r"extra|supl|revis", gazette_text, re.IGNORECASE)
54+
)
4855

4956
yield Gazette(
5057
date=date,
51-
file_urls=[path_to_gazette],
58+
edition_number=edition_number,
5259
is_extra_edition=is_extra_edition,
60+
file_urls=[path_to_gazette],
5361
power="executive",
5462
)
5563

@@ -60,3 +68,32 @@ def parse(self, response):
6068
)
6169
if next_url:
6270
yield Request(response.urljoin(next_url))
71+
72+
def extract_date(self, text):
    """Extract a date written with a full month name from a text.

    The first occurrence of ``<day>[º|°] [de] <month> [de] <year>`` in
    ``text`` is used. When the month word is not one of ``self.months`` it
    is treated as a typing error and replaced by its closest fuzzy match
    before the date is parsed.

    Args:
        text: A text containing a date with the name of the month in its
            full version (%B).

    Returns:
        The parsed date. Returns None when no date pattern is found, when
        the best fuzzy match for the month scores below 70, or when
        dateparser cannot parse the resulting string.
    """
    # The day may be followed by either the ordinal indicator "º" or the
    # look-alike degree sign "°"; both are stripped below, so both must be
    # accepted here or dates typed with "°" would never match at all.
    match_date = re.search(r"\d{1,2}[º°]?(\sde)? +(\w+)(\sde)? +\d{4}", text)
    if not match_date:
        return None

    raw_date = match_date.group(0).replace("º", "").replace("°", "")
    month = match_date.group(2)
    if month.lower() not in self.months:
        # Unknown month word: assume a typo and pick the closest known month.
        match_month, score = process.extractOne(month, self.months)
        if score < 70:
            # Too dissimilar to any month name to be corrected safely.
            return None
        raw_date = raw_date.replace(month, match_month)
        self.logger.warning(
            f' Erro de digitação em "{text}". CORRIGIDO DE {month} PARA {match_month}'
        )

    parsed_datetime = dateparser.parse(raw_date, languages=["pt"])
    return parsed_datetime.date() if parsed_datetime else None

0 commit comments

Comments
 (0)