Skip to content

Commit

Permalink
correções(fabio)
Browse files Browse the repository at this point in the history
  • Loading branch information
JP0ttoni committed Dec 12, 2024
1 parent 0f0d091 commit ad584b1
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 82 deletions.
73 changes: 0 additions & 73 deletions data_collection/gazette/spiders/base/base_RG_sites.py

This file was deleted.

56 changes: 56 additions & 0 deletions data_collection/gazette/spiders/base/rgsites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import re
from datetime import datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseRgSites(BaseGazetteSpider):
    """Base spider for sites that publish all gazette editions on one page.

    The page is expected to contain one ``div[role="tabpanel"]`` per year
    (with an ``id`` of the form ``tab_<year>``), each holding month panels
    whose ``td.edicao`` cells carry the PDF link and the publication date.

    Subclasses must provide ``BASE_URL`` (the listing page) and
    ``start_date``. ``end_date`` is read from the instance and is not
    defined here -- presumably it is set by ``BaseGazetteSpider``.
    """

    def start_requests(self):
        """Request the single listing page; the response goes to ``parse``."""
        yield scrapy.Request(self.BASE_URL)

    def parse(self, response):
        """Yield a ``Gazette`` for every edition dated within the crawl range.

        Every edition is filtered individually against
        ``start_date <= date <= end_date``, so the result does not depend on
        the order in which the site lists years, months or days. Cells with
        a missing link or date are skipped instead of aborting the crawl.
        """
        # Iterate the year panels in reverse document order, matching the
        # order in which items were historically emitted by this spider.
        for year_panel in reversed(response.css('div[role="tabpanel"]')):
            panel_id = year_panel.css("::attr(id)").get() or ""
            year_text = panel_id.replace("tab_", "").strip()
            if not year_text.isdigit():
                # Not a year tab (or the id is missing): nothing to collect.
                continue

            year = int(year_text)
            # Cheap pre-filter: skip whole years outside the requested range.
            if year < self.start_date.year or year > self.end_date.year:
                continue

            month_panels = year_panel.css(
                "div.panel.panel-primary.rg-border-radius-none"
            )
            for month_panel in month_panels:
                for cell in month_panel.css("td.edicao"):
                    link = cell.css('a[data-toggle="modal-pdf"]')
                    pdf_href = link.css("::attr(href)").get()
                    raw_date = cell.css(
                        "span.visible-xs-inline-block::text"
                    ).get()
                    if not pdf_href or not raw_date:
                        continue

                    # The date is rendered like "(dd/mm/YYYY)", possibly
                    # padded with non-breaking spaces. Strip last so that
                    # padding inside the parentheses is removed as well.
                    date_text = (
                        raw_date.replace("\xa0", "")
                        .replace("(", "")
                        .replace(")", "")
                        .strip()
                    )
                    gazette_date = dt.strptime(date_text, "%d/%m/%Y").date()
                    if not self.start_date <= gazette_date <= self.end_date:
                        continue

                    # Keep only the digits of the link text as the edition.
                    edition = re.sub(r"\D", "", link.css("::text").get() or "")

                    yield Gazette(
                        date=gazette_date,
                        edition_number=edition,
                        is_extra_edition=False,
                        # Resolve relative hrefs; absolute URLs are unchanged.
                        file_urls=[response.urljoin(pdf_href)],
                        power="executive",
                    )
6 changes: 3 additions & 3 deletions data_collection/gazette/spiders/mg/mg_esmeraldas.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from datetime import date

from gazette.spiders.base.base_RG_sites import Base_RgSites
from gazette.spiders.base.rgsites import BaseRgSites


class UFMunicipioSpider(Base_RgSites):
class UFMunicipioSpider(BaseRgSites):
name = "mg_esmeraldas"
TERRITORY_ID = "3124104"
allowed_domains = ["www.esmeraldas.mg.gov.br"]
start_urls = ["https://www.esmeraldas.mg.gov.br/diario-oficial-eletronico"]
BASE_URL = "https://www.esmeraldas.mg.gov.br/diario-oficial-eletronico"
start_date = date(2021, 6, 12)
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from datetime import date

from gazette.spiders.base.base_RG_sites import Base_RgSites
from gazette.spiders.base.rgsites import BaseRgSites


class UFMunicipioSpider(Base_RgSites):
class UFMunicipioSpider(BaseRgSites):
name = "mg_sao_joao_batista_do_gloria"
TERRITORY_ID = "3162203"
allowed_domains = ["www.gloria.mg.gov.br"]
start_urls = ["https://www.gloria.mg.gov.br/diario-oficial"]
BASE_URL = "https://www.gloria.mg.gov.br/diario-oficial"
start_date = date(2019, 1, 3)
6 changes: 3 additions & 3 deletions data_collection/gazette/spiders/rj/rj_cantagalo.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from datetime import date

from gazette.spiders.base.base_RG_sites import Base_RgSites
from gazette.spiders.base.rgsites import BaseRgSites


class UFMunicipioSpider(Base_RgSites):
class UFMunicipioSpider(BaseRgSites):
name = "rj_cantagalo"
TERRITORY_ID = "3301108"
allowed_domains = ["www.cantagalo.rj.gov.br"]
start_urls = ["https://www.cantagalo.rj.gov.br/transparencia/diario-oficial"]
BASE_URL = "https://www.cantagalo.rj.gov.br/transparencia/diario-oficial"
start_date = date(2018, 3, 26)

0 comments on commit ad584b1

Please sign in to comment.