Skip to content

Commit

Permalink
Cria spider base rgsites #1245
Browse files Browse the repository at this point in the history
corrigindo o nome das classes #1245

corrigindo o nome das classes dos municipios #1245
  • Loading branch information
JP0ttoni committed Feb 4, 2025
1 parent 338dbbe commit 2d239c1
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 0 deletions.
59 changes: 59 additions & 0 deletions data_collection/gazette/spiders/base/rgsites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import re
from datetime import date, datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseRgSites(BaseGazetteSpider):
    """Base spider for gazette listing pages that share the markup parsed below.

    Subclasses are expected to provide ``BASE_URL`` (the listing page) in
    addition to the usual spider attributes. ``start_date`` and ``end_date``
    are read from the instance and bound the gazettes that are yielded.

    The page layout relied upon is:

    * one ``div[role="tabpanel"]`` per year, whose ``id`` is ``tab_<year>``;
    * inside it, one ``div.panel.panel-primary.rg-border-radius-none`` per
      month;
    * inside each month, one ``td.edicao`` cell per published edition,
      holding the date in ``span.visible-xs-inline-block`` and the PDF link
      in ``a[data-toggle="modal-pdf"]``.
    """

    def start_requests(self):
        """Request the single listing page declared as ``BASE_URL``."""
        yield scrapy.Request(self.BASE_URL)

    @staticmethod
    def _first_day_of_next_month(day):
        """Return the first day of the month after ``day``.

        December rolls over to January of the following year; building
        ``date(year, 13, 1)`` directly would raise ``ValueError``.
        """
        if day.month == 12:
            return date(day.year + 1, 1, 1)
        return date(day.year, day.month + 1, 1)

    def parse(self, response):
        """Yield a ``Gazette`` for every edition between start and end dates.

        Editions dated after ``end_date`` are skipped; the first edition dated
        before ``start_date`` stops the crawl entirely.
        """
        month_after_end_date = self._first_day_of_next_month(self.end_date)

        for year in response.css('div[role="tabpanel"]'):
            year_id = year.css("::attr(id)").get()
            if year_id is None:
                # A tabpanel without an id cannot be mapped to a year.
                continue
            try:
                year_number = int(year_id.replace("tab_", ""))
            except ValueError:
                # Not a "tab_<year>" panel; skip it instead of aborting.
                continue
            if not self.start_date.year <= year_number <= self.end_date.year:
                continue

            # Months and days are walked in reverse page order.
            # NOTE(review): the early exits below assume this yields
            # newest-first dates, i.e. the page lists them ascending — confirm.
            months = year.css("div.panel.panel-primary.rg-border-radius-none")
            for month in reversed(months):
                for day in reversed(month.css("td.edicao")):
                    date_text = day.css("span.visible-xs-inline-block::text").get()
                    if date_text is None:
                        # Cell without a date label: nothing to classify.
                        continue
                    # Remove the decoration first, then strip, so whitespace
                    # left inside the parentheses cannot break strptime.
                    date_text = (
                        date_text.replace("\xa0", "")
                        .replace("(", "")
                        .replace(")", "")
                        .strip()
                    )
                    gazette_date = dt.strptime(date_text, "%d/%m/%Y").date()

                    if gazette_date >= month_after_end_date:
                        # Dated in a month after end_date's month: give up on
                        # the remaining cells of this month panel.
                        break

                    if gazette_date > self.end_date:
                        continue

                    if gazette_date < self.start_date:
                        return

                    link = day.css('a[data-toggle="modal-pdf"]')
                    raw_edition = link.css("::text").get()
                    url_pdf = link.css("::attr(href)").get()
                    if raw_edition is None or url_pdf is None:
                        # No link text or no href means there is no file to
                        # download for this cell.
                        continue

                    yield Gazette(
                        date=gazette_date,
                        # Keep only the digits of the link label.
                        edition_number=re.sub(r"\D", "", raw_edition),
                        is_extra_edition="extra" in raw_edition.lower(),
                        file_urls=[url_pdf],
                        power="executive",
                    )
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/mg/mg_esmeraldas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.rgsites import BaseRgSites


class MgEsmeraldasSpider(BaseRgSites):
    """Gazette spider for Esmeraldas (MG), configured on top of BaseRgSites."""

    # Spider and territory identifiers.
    name = "mg_esmeraldas"
    TERRITORY_ID = "3124104"

    # Earliest date this spider collects from.
    start_date = date(2021, 6, 12)

    # Listing page requested by the base spider, and the domain it lives on.
    BASE_URL = "https://www.esmeraldas.mg.gov.br/diario-oficial-eletronico"
    allowed_domains = ["www.esmeraldas.mg.gov.br"]
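11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/mg/mg_sao_joao_batista_do_gloria.py
Original file line number Diff line number Diff line change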
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.rgsites import BaseRgSites


class MgSaoJoaoBatistaDoGloriaSpider(BaseRgSites):
    """Gazette spider for São João Batista do Glória (MG), built on BaseRgSites."""

    # Spider and territory identifiers.
    name = "mg_sao_joao_batista_do_gloria"
    TERRITORY_ID = "3162203"

    # Earliest date this spider collects from.
    start_date = date(2019, 1, 3)

    # Listing page requested by the base spider, and the domain it lives on.
    BASE_URL = "https://www.gloria.mg.gov.br/diario-oficial"
    allowed_domains = ["www.gloria.mg.gov.br"]
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/rj/rj_cantagalo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.rgsites import BaseRgSites


class RjCantagaloSpider(BaseRgSites):
    """Gazette spider for Cantagalo (RJ), configured on top of BaseRgSites."""

    # Spider and territory identifiers.
    name = "rj_cantagalo"
    TERRITORY_ID = "3301108"

    # Earliest date this spider collects from.
    start_date = date(2018, 3, 26)

    # Listing page requested by the base spider, and the domain it lives on.
    BASE_URL = "https://www.cantagalo.rj.gov.br/transparencia/diario-oficial"
    allowed_domains = ["www.cantagalo.rj.gov.br"]

0 comments on commit 2d239c1

Please sign in to comment.