Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

finalizando issue 1245 #1332

Closed
wants to merge 12 commits into from
73 changes: 73 additions & 0 deletions data_collection/gazette/spiders/base/base_RG_sites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import re
from datetime import datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class Base_RgSites(BaseGazetteSpider):
    """Base spider for municipalities whose gazettes are published on the
    "RG Sites" platform.

    Subclasses must define ``start_urls`` (a single listing-page URL),
    ``start_date``, ``name`` and ``TERRITORY_ID``.
    """

    def start_requests(self):
        # The listing page is fetched with a plain GET; no form data is
        # involved, so a regular Request is the right construct.
        yield scrapy.Request(url=self.start_urls[0])

    def parse(self, response):
        """Yield a Gazette item for every edition between start_date and end_date.

        The page groups editions in one tab panel per year, each containing
        month panels, each containing day cells.
        """
        # One <div role="tabpanel"> per year; the site lists the newest year
        # first, so reverse to walk years in ascending order.
        year_panels = response.css('div[role="tabpanel"]')
        year_panels.reverse()
        for year_panel in year_panels:
            # Panel ids look like "tab_2021"; strip the prefix to get the year.
            year = int(year_panel.css("::attr(id)").get().replace("tab_", ""))
            if year < self.start_date.year:
                continue
            # Month panels inside the selected year.
            month_panels = year_panel.css(
                "div.panel.panel-primary.rg-border-radius-none"
            )
            for month_panel in month_panels:
                for day_cell in month_panel.css("td.edicao"):
                    # Edition number: keep only the digits of the link text.
                    edition_number = re.sub(
                        r"\D", "", day_cell.css('a[data-toggle="modal-pdf"]::text').get()
                    )
                    pdf_url = day_cell.css(
                        'a[data-toggle="modal-pdf"]::attr(href)'
                    ).get()
                    # Date comes as e.g. "(01/02/2021)" possibly padded with
                    # non-breaking spaces; normalize before parsing.
                    raw_date = day_cell.css("span.visible-xs-inline-block::text").get()
                    raw_date = (
                        raw_date.strip()
                        .replace("\xa0", "")
                        .replace("(", "")
                        .replace(")", "")
                    )
                    gazette_date = dt.strptime(raw_date, "%d/%m/%Y").date()
                    # Months before the requested start month (within the
                    # first year) can be skipped wholesale.
                    if (
                        year == self.start_date.year
                        and gazette_date.month < self.start_date.month
                    ):
                        break
                    if gazette_date < self.start_date:
                        continue
                    # Dates are ascending, so everything past end_date is done.
                    if gazette_date > self.end_date:
                        return
                    yield Gazette(
                        date=gazette_date,
                        edition_number=edition_number,
                        is_extra_edition=False,
                        file_urls=[pdf_url],
                        power="executive",
                    )
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/mg/mg_esmeraldas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.base_RG_sites import Base_RgSites


class UFMunicipioSpider(Base_RgSites):
    """Gazette spider for Esmeraldas/MG (RG Sites platform)."""

    name = "mg_esmeraldas"  # scrapy spider identifier
    TERRITORY_ID = "3124104"  # presumably the IBGE municipality code — confirm
    allowed_domains = ["www.esmeraldas.mg.gov.br"]
    start_urls = ["https://www.esmeraldas.mg.gov.br/diario-oficial-eletronico"]
    start_date = date(2021, 6, 12)  # earliest edition available on the site
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.base_RG_sites import Base_RgSites


class UFMunicipioSpider(Base_RgSites):
    """Gazette spider for São João Batista do Glória/MG (RG Sites platform)."""

    name = "mg_sao_joao_batista_do_gloria"  # scrapy spider identifier
    TERRITORY_ID = "3162203"  # presumably the IBGE municipality code — confirm
    allowed_domains = ["www.gloria.mg.gov.br"]
    start_urls = ["https://www.gloria.mg.gov.br/diario-oficial"]
    start_date = date(2019, 1, 3)  # earliest edition available on the site
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/rj/rj_cantagalo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.base_RG_sites import Base_RgSites


class UFMunicipioSpider(Base_RgSites):
    """Gazette spider for Cantagalo/RJ (RG Sites platform)."""

    name = "rj_cantagalo"  # scrapy spider identifier
    TERRITORY_ID = "3301108"  # presumably the IBGE municipality code — confirm
    allowed_domains = ["www.cantagalo.rj.gov.br"]
    start_urls = ["https://www.cantagalo.rj.gov.br/transparencia/diario-oficial"]
    start_date = date(2018, 3, 26)  # earliest edition available on the site
Loading