diff --git a/data_collection/gazette/spiders/ba/ba_amelia_rodrigues.py b/data_collection/gazette/spiders/ba/ba_amelia_rodrigues.py index de4df8c80..7ba4f3d49 100644 --- a/data_collection/gazette/spiders/ba/ba_amelia_rodrigues.py +++ b/data_collection/gazette/spiders/ba/ba_amelia_rodrigues.py @@ -7,5 +7,5 @@ class BaAmeliaRodriguesSpider(ImprensaOficialSpider): name = "ba_amelia_rodrigues" allowed_domains = ["pmameliarodriguesba.imprensaoficial.org"] start_date = date(2015, 1, 1) - url_base = "http://pmameliarodriguesba.imprensaoficial.org/{}" + city_domain = "http://pmameliarodriguesba.imprensaoficial.org" - TERRITORY_ID = "2930501" + TERRITORY_ID = "2901106" diff --git a/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida.py b/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida.py deleted file mode 100644 index 537744b3b..000000000 --- a/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida.py +++ /dev/null @@ -1,14 +0,0 @@ -from datetime import date - -from gazette.spiders.base.imprensa_oficial import ImprensaOficialSpider - - -class BaConceicaoDoAlmeidaSpider(ImprensaOficialSpider): - name = "ba_conceicao_do_almeida" - allowed_domains = [ - "pmconceicaodoalmeidaba.imprensaoficial.org", - "conceicaodoalmeida.ba.gov.br", - ] - start_date = date(2019, 5, 1) - url_base = "http://conceicaodoalmeida.ba.gov.br/{}" - TERRITORY_ID = "2908309" diff --git a/data_collection/gazette/spiders/ba/ba_gentio_do_ouro.py b/data_collection/gazette/spiders/ba/ba_gentio_do_ouro.py index 6a0f00aef..d007f73ea 100644 --- a/data_collection/gazette/spiders/ba/ba_gentio_do_ouro.py +++ b/data_collection/gazette/spiders/ba/ba_gentio_do_ouro.py @@ -7,5 +7,5 @@ class BaGentioDoOuroSpider(ImprensaOficialSpider): name = "ba_gentio_do_ouro" allowed_domains = ["pmgentiodoouroba.imprensaoficial.org"] start_date = date(2017, 2, 1) - url_base = "http://pmgentiodoouroba.imprensaoficial.org/{}" + city_domain = "http://pmgentiodoouroba.imprensaoficial.org" TERRITORY_ID = "2911303" diff --git 
a/data_collection/gazette/spiders/ba/ba_gongogi.py b/data_collection/gazette/spiders/ba/ba_gongogi.py index 339637dac..efcd347ab 100644 --- a/data_collection/gazette/spiders/ba/ba_gongogi.py +++ b/data_collection/gazette/spiders/ba/ba_gongogi.py @@ -1,12 +1,11 @@ from datetime import date -from gazette.spiders.base.imprensa_oficial import ImprensaOficialSpider +from gazette.spiders.base.sai import SaiGazetteSpider -class BaGongogiSpider(ImprensaOficialSpider): +class BaGongogiSpider(SaiGazetteSpider): name = "ba_gongogi" - allowed_domains = ["pmgongogiba.imprensaoficial.org"] - start_date = date(2020, 2, 1) - end_date = date(2020, 12, 30) - url_base = "http://pmgongogiba.imprensaoficial.org/{}" + allowed_domains = ["gongogi.ba.gov.br"] + start_date = date(2005, 8, 15) + base_url = "https://www.gongogi.ba.gov.br" TERRITORY_ID = "2911501" diff --git a/data_collection/gazette/spiders/ba/ba_governador_mangabeira.py b/data_collection/gazette/spiders/ba/ba_governador_mangabeira.py index 79a0eb718..b6f2f975c 100644 --- a/data_collection/gazette/spiders/ba/ba_governador_mangabeira.py +++ b/data_collection/gazette/spiders/ba/ba_governador_mangabeira.py @@ -5,7 +5,7 @@ class BaGovernadorMangabeiraSpider(ImprensaOficialSpider): name = "ba_governador_mangabeira" - allowed_domains = ["pmGOVERNADORMANGABEIRABA.imprensaoficial.org"] + allowed_domains = ["pmgovernadormangabeiraba.imprensaoficial.org"] start_date = date(2018, 1, 1) - url_base = "http://pmGOVERNADORMANGABEIRABA.imprensaoficial.org/{}" + city_domain = "http://pmgovernadormangabeiraba.imprensaoficial.org" TERRITORY_ID = "2911600" diff --git a/data_collection/gazette/spiders/ba/ba_itaquara.py b/data_collection/gazette/spiders/ba/ba_itaquara_2019.py similarity index 53% rename from data_collection/gazette/spiders/ba/ba_itaquara.py rename to data_collection/gazette/spiders/ba/ba_itaquara_2019.py index 86b203019..23ace6275 100644 --- a/data_collection/gazette/spiders/ba/ba_itaquara.py +++ 
b/data_collection/gazette/spiders/ba/ba_itaquara_2019.py @@ -4,8 +4,9 @@ class BaItaquaraSpider(ImprensaOficialSpider): - name = "ba_itaquara" - allowed_domains = ["pmitaquaraba.imprensaoficial.org", "itaquara.ba.gov.br"] + name = "ba_itaquara_2019" + allowed_domains = ["pmitaquaraba.imprensaoficial.org"] start_date = date(2019, 1, 1) - url_base = "http://itaquara.ba.gov.br/{}" + end_date = date(2022, 1, 4) + city_domain = "http://pmitaquaraba.imprensaoficial.org" TERRITORY_ID = "2916708" diff --git a/data_collection/gazette/spiders/ba/ba_jaguarari.py b/data_collection/gazette/spiders/ba/ba_jaguarari.py index 2a3156e07..3349325ef 100644 --- a/data_collection/gazette/spiders/ba/ba_jaguarari.py +++ b/data_collection/gazette/spiders/ba/ba_jaguarari.py @@ -8,5 +8,5 @@ class BaJaguarariSpider(ImprensaOficialSpider): allowed_domains = ["pmjaguarariba.imprensaoficial.org"] start_date = date(2019, 10, 1) end_date = date(2020, 12, 31) - url_base = "http://pmjaguarariba.imprensaoficial.org/{}" + city_domain = "http://pmjaguarariba.imprensaoficial.org" TERRITORY_ID = "2917706" diff --git a/data_collection/gazette/spiders/ba/ba_muniz_ferreira.py b/data_collection/gazette/spiders/ba/ba_muniz_ferreira.py index ed694c6fa..0c83a6d19 100644 --- a/data_collection/gazette/spiders/ba/ba_muniz_ferreira.py +++ b/data_collection/gazette/spiders/ba/ba_muniz_ferreira.py @@ -7,6 +7,6 @@ class BaMunizFerreiraSpider(ImprensaOficialSpider): name = "ba_muniz_ferreira" allowed_domains = ["pmmunizferreiraba.imprensaoficial.org"] start_date = date(2014, 12, 1) - end_date = date(2021, 1, 19) - url_base = "http://pmmunizferreiraba.imprensaoficial.org/{}" + end_date = date(2022, 9, 27) + city_domain = "http://pmmunizferreiraba.imprensaoficial.org" TERRITORY_ID = "2922201" diff --git a/data_collection/gazette/spiders/ba/ba_paratinga.py b/data_collection/gazette/spiders/ba/ba_paratinga.py index 946dc56e8..0a36b5e19 100644 --- a/data_collection/gazette/spiders/ba/ba_paratinga.py +++ 
b/data_collection/gazette/spiders/ba/ba_paratinga.py @@ -7,5 +7,5 @@ class BaParatingaSpider(ImprensaOficialSpider): name = "ba_paratinga" allowed_domains = ["pmparatingaba.imprensaoficial.org"] start_date = date(2018, 4, 1) - url_base = "http://pmparatingaba.imprensaoficial.org/{}" + city_domain = "http://pmparatingaba.imprensaoficial.org" TERRITORY_ID = "2923704" diff --git a/data_collection/gazette/spiders/ba/ba_pe_de_serra.py b/data_collection/gazette/spiders/ba/ba_pe_de_serra.py index 12ee34dc3..2a5e87144 100644 --- a/data_collection/gazette/spiders/ba/ba_pe_de_serra.py +++ b/data_collection/gazette/spiders/ba/ba_pe_de_serra.py @@ -7,5 +7,5 @@ class BaPeDeSerraSpider(ImprensaOficialSpider): name = "ba_pe_de_serra" allowed_domains = ["pmpedeserraba.imprensaoficial.org"] start_date = date(2017, 1, 1) - url_base = "http://pmpedeserraba.imprensaoficial.org/{}" + city_domain = "http://pmpedeserraba.imprensaoficial.org" TERRITORY_ID = "2924058" diff --git a/data_collection/gazette/spiders/ba/ba_sao_felipe.py b/data_collection/gazette/spiders/ba/ba_sao_felipe.py index 239be27d5..e8e65f155 100644 --- a/data_collection/gazette/spiders/ba/ba_sao_felipe.py +++ b/data_collection/gazette/spiders/ba/ba_sao_felipe.py @@ -8,5 +8,5 @@ class BaSaoFelipeSpider(ImprensaOficialSpider): allowed_domains = ["pmsaofelipeba.imprensaoficial.org"] start_date = date(2020, 1, 1) end_date = date(2021, 4, 22) - url_base = "http://pmsaofelipeba.imprensaoficial.org/{}" + city_domain = "http://pmsaofelipeba.imprensaoficial.org" TERRITORY_ID = "2929107" diff --git a/data_collection/gazette/spiders/ba/ba_sao_francisco_do_conde.py b/data_collection/gazette/spiders/ba/ba_sao_francisco_do_conde.py index b1e70e55a..c3ebc6804 100644 --- a/data_collection/gazette/spiders/ba/ba_sao_francisco_do_conde.py +++ b/data_collection/gazette/spiders/ba/ba_sao_francisco_do_conde.py @@ -7,5 +7,5 @@ class BaSaoFranciscoDoCondeSpider(ImprensaOficialSpider): name = "ba_sao_francisco_do_conde" allowed_domains = 
["pmsaofranciscodocondeba.imprensaoficial.org"] start_date = date(2019, 3, 1) - url_base = "http://pmsaofranciscodocondeba.imprensaoficial.org/{}" + city_domain = "http://pmsaofranciscodocondeba.imprensaoficial.org" TERRITORY_ID = "2929206" diff --git a/data_collection/gazette/spiders/ba/ba_sao_miguel_das_matas.py b/data_collection/gazette/spiders/ba/ba_sao_miguel_das_matas.py index 0c6494cec..09f2c100d 100644 --- a/data_collection/gazette/spiders/ba/ba_sao_miguel_das_matas.py +++ b/data_collection/gazette/spiders/ba/ba_sao_miguel_das_matas.py @@ -5,10 +5,7 @@ class BaSaoMiguelDasMatasSpider(ImprensaOficialSpider): name = "ba_sao_miguel_das_matas" - allowed_domains = [ - "pmsaomigueldasmatasba.imprensaoficial.org", - "saomigueldasmatas.ba.gov.br", - ] + allowed_domains = ["pmsaomigueldasmatasba.imprensaoficial.org"] start_date = date(2019, 2, 1) - url_base = "http://saomigueldasmatas.ba.gov.br/{}" + city_domain = "http://pmsaomigueldasmatasba.imprensaoficial.org" TERRITORY_ID = "2929404" diff --git a/data_collection/gazette/spiders/ba/ba_sapeacu.py b/data_collection/gazette/spiders/ba/ba_sapeacu.py index 50d7bb1cf..856f444e3 100644 --- a/data_collection/gazette/spiders/ba/ba_sapeacu.py +++ b/data_collection/gazette/spiders/ba/ba_sapeacu.py @@ -5,7 +5,7 @@ class BaSapeacuSpider(ImprensaOficialSpider): name = "ba_sapeacu" - allowed_domains = ["pmsapeacuba.imprensaoficial.org", "sapeacu.ba.gov.br"] + allowed_domains = ["pmsapeacuba.imprensaoficial.org"] start_date = date(2017, 1, 1) - url_base = "http://sapeacu.ba.gov.br/{}" + city_domain = "http://pmsapeacuba.imprensaoficial.org" TERRITORY_ID = "2929602" diff --git a/data_collection/gazette/spiders/ba/ba_saude.py b/data_collection/gazette/spiders/ba/ba_saude_2018.py similarity index 77% rename from data_collection/gazette/spiders/ba/ba_saude.py rename to data_collection/gazette/spiders/ba/ba_saude_2018.py index 2d8bc514a..8e617943a 100644 --- a/data_collection/gazette/spiders/ba/ba_saude.py +++ 
b/data_collection/gazette/spiders/ba/ba_saude_2018.py @@ -4,9 +4,9 @@ class BaSaudeSpider(ImprensaOficialSpider): - name = "ba_saude" + name = "ba_saude_2018" allowed_domains = ["pmsaudeba.imprensaoficial.org"] start_date = date(2018, 2, 1) end_date = date(2019, 4, 12) - url_base = "http://pmsaudeba.imprensaoficial.org/{}" + city_domain = "http://pmsaudeba.imprensaoficial.org" TERRITORY_ID = "2929800" diff --git a/data_collection/gazette/spiders/ba/ba_serrinha.py b/data_collection/gazette/spiders/ba/ba_serrinha.py index 6b72fd2d1..c70042bb4 100644 --- a/data_collection/gazette/spiders/ba/ba_serrinha.py +++ b/data_collection/gazette/spiders/ba/ba_serrinha.py @@ -7,5 +7,5 @@ class BaSerrinhaSpider(ImprensaOficialSpider): name = "ba_serrinha" allowed_domains = ["pmserrinhaba.imprensaoficial.org"] start_date = date(2020, 1, 1) - url_base = "http://pmserrinhaba.imprensaoficial.org/{}" + city_domain = "http://pmserrinhaba.imprensaoficial.org" TERRITORY_ID = "2930501" diff --git a/data_collection/gazette/spiders/ba/ba_vera_cruz.py b/data_collection/gazette/spiders/ba/ba_vera_cruz.py index 4a03d8f7b..987419095 100644 --- a/data_collection/gazette/spiders/ba/ba_vera_cruz.py +++ b/data_collection/gazette/spiders/ba/ba_vera_cruz.py @@ -7,5 +7,5 @@ class BaVeraCruzSpider(ImprensaOficialSpider): name = "ba_vera_cruz" allowed_domains = ["pmveracruzba.imprensaoficial.org"] start_date = date(2017, 4, 1) - url_base = "http://pmveracruzba.imprensaoficial.org/{}" + city_domain = "http://pmveracruzba.imprensaoficial.org" TERRITORY_ID = "2933208" diff --git a/data_collection/gazette/spiders/ba/ba_wenceslau_guimaraes.py b/data_collection/gazette/spiders/ba/ba_wenceslau_guimaraes.py index 156f90ab6..325655e35 100644 --- a/data_collection/gazette/spiders/ba/ba_wenceslau_guimaraes.py +++ b/data_collection/gazette/spiders/ba/ba_wenceslau_guimaraes.py @@ -7,5 +7,5 @@ class BaWenceslauGuimaraesSpider(ImprensaOficialSpider): name = "ba_wenceslau_guimaraes" allowed_domains = 
["pmwenceslauguimaraesba.imprensaoficial.org"] start_date = date(2017, 1, 1) - url_base = "http://pmwenceslauguimaraesba.imprensaoficial.org/{}" + city_domain = "http://pmwenceslauguimaraesba.imprensaoficial.org" TERRITORY_ID = "2933505" diff --git a/data_collection/gazette/spiders/ba/ba_xique_xique.py b/data_collection/gazette/spiders/ba/ba_xique_xique.py index 882249656..65f297428 100644 --- a/data_collection/gazette/spiders/ba/ba_xique_xique.py +++ b/data_collection/gazette/spiders/ba/ba_xique_xique.py @@ -7,5 +7,5 @@ class BaXiqueXiqueSpider(ImprensaOficialSpider): name = "ba_xique_xique" allowed_domains = ["pmxiquexiqueba.imprensaoficial.org"] start_date = date(2017, 1, 1) - url_base = "http://pmxiquexiqueba.imprensaoficial.org/{}" + city_domain = "http://pmxiquexiqueba.imprensaoficial.org" TERRITORY_ID = "2933604" diff --git a/data_collection/gazette/spiders/base/imprensa_oficial.py b/data_collection/gazette/spiders/base/imprensa_oficial.py index f64e48b64..369b795ae 100644 --- a/data_collection/gazette/spiders/base/imprensa_oficial.py +++ b/data_collection/gazette/spiders/base/imprensa_oficial.py @@ -16,14 +16,17 @@ def start_requests(self): freq=MONTHLY, dtstart=initial_date, until=self.end_date ): year_month = monthly_date.strftime("%Y/%m/") # like 2015/01 - yield scrapy.Request( - self.url_base.format(year_month), callback=self.extract_gazette_links - ) + url = f"{self.city_domain}/{year_month}" + + yield scrapy.Request(url, callback=self.extract_gazette_links) def extract_gazette_links(self, response): - for gazette_link in response.css("h2 a::attr(href)").getall(): + links = response.css("h2 a::attr(href)").getall() + + for gazette_link in links: raw_gazette_date = re.search(r"\d{4}/\d{2}/\d{2}", gazette_link).group() gazette_date = datetime.strptime(raw_gazette_date, "%Y/%m/%d").date() + if gazette_date < self.start_date: return yield scrapy.Request(gazette_link) @@ -32,6 +35,7 @@ def extract_gazette_links(self, response): another_page = 
response.xpath( ".//a[contains(text(), 'Publicações mais antigas')]/@href" ).get() + if another_page: yield scrapy.Request(another_page, callback=self.extract_gazette_links) @@ -39,10 +43,12 @@ def parse(self, response): file_url = response.css( "div.entry-content a[href*='baixar.php?arquivo=']::attr(href)" ).get() + if not file_url: # older dates file_url = response.css( "div.entry-content a[title='Baixar Diário']::attr(href)" ).get() + gazette_date = response.css("span.posted-on a time::attr(datetime)").get() gazette_date = datetime.strptime(gazette_date, "%Y-%m-%dT%H:%M:%S%z").date()