import re
from datetime import datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class Base_RgSites(BaseGazetteSpider):
    """Base spider for municipalities publishing gazettes on the "RG" platform.

    The landing page lists every edition grouped first by year (one
    ``div[role="tabpanel"]`` per year) and then by month. Subclasses must
    define ``name``, ``TERRITORY_ID``, ``allowed_domains``, ``start_urls``
    (a single landing-page URL) and ``start_date``.
    """

    def start_requests(self):
        # A single GET to the landing page is enough; every edition is
        # listed there. (The original used FormRequest(method="GET"),
        # which is equivalent to a plain Request when no form data is sent.)
        yield scrapy.Request(url=self.start_urls[0])

    def parse(self, response):
        # One panel per year; reversed so iteration goes oldest -> newest,
        # which is what the early-exit logic below relies on.
        year_panels = response.css('div[role="tabpanel"]')
        year_panels.reverse()
        for year_panel in year_panels:
            year = int(year_panel.css("::attr(id)").get().replace("tab_", ""))
            if year < self.start_date.year:
                continue

            # One sub-panel per month of the selected year.
            month_panels = year_panel.css(
                "div.panel.panel-primary.rg-border-radius-none"
            )
            for month_panel in month_panels:
                for day_cell in month_panel.css("td.edicao"):
                    link = day_cell.css('a[data-toggle="modal-pdf"]')
                    raw_edition = link.css("::text").get()
                    raw_date = day_cell.css(
                        "span.visible-xs-inline-block::text"
                    ).get()
                    # Calendar cells without a published edition have no
                    # link/date; skip them instead of raising TypeError.
                    if raw_edition is None or raw_date is None:
                        continue

                    edition_number = re.sub(r"\D", "", raw_edition)
                    pdf_url = link.css("::attr(href)").get()
                    # Date is rendered like "(dd/mm/yyyy)" with stray
                    # non-breaking spaces around it.
                    raw_date = (
                        raw_date.strip()
                        .replace("\xa0", "")
                        .replace("(", "")
                        .replace(")", "")
                    )
                    gazette_date = dt.strptime(raw_date, "%d/%m/%Y").date()

                    if (
                        year == self.start_date.year
                        and gazette_date.month < self.start_date.month
                    ):
                        # The whole month predates the requested window;
                        # skip the remaining days of this month.
                        break

                    if gazette_date < self.start_date:
                        continue

                    if gazette_date > self.end_date:
                        # Editions are listed chronologically, so nothing
                        # newer than end_date can follow.
                        return

                    yield Gazette(
                        date=gazette_date,
                        edition_number=edition_number,
                        is_extra_edition=False,
                        file_urls=[pdf_url],
                        power="executive",
                    )
from datetime import date

from gazette.spiders.base.base_RG_sites import Base_RgSites


class MgEsmeraldasSpider(Base_RgSites):
    """Official gazette of Esmeraldas/MG (RG platform)."""

    name = "mg_esmeraldas"
    TERRITORY_ID = "3124104"
    allowed_domains = ["www.esmeraldas.mg.gov.br"]
    start_urls = ["https://www.esmeraldas.mg.gov.br/diario-oficial-eletronico"]
    start_date = date(2021, 6, 12)


class MgSaoJoaoBatistaDoGloriaSpider(Base_RgSites):
    """Official gazette of São João Batista do Glória/MG (RG platform)."""

    name = "mg_sao_joao_batista_do_gloria"
    TERRITORY_ID = "3162203"
    allowed_domains = ["www.gloria.mg.gov.br"]
    start_urls = ["https://www.gloria.mg.gov.br/diario-oficial"]
    start_date = date(2019, 1, 3)


class RjCantagaloSpider(Base_RgSites):
    """Official gazette of Cantagalo/RJ (RG platform)."""

    name = "rj_cantagalo"
    TERRITORY_ID = "3301108"
    allowed_domains = ["www.cantagalo.rj.gov.br"]
    start_urls = ["https://www.cantagalo.rj.gov.br/transparencia/diario-oficial"]
    start_date = date(2018, 3, 26)