From 3938162b8cdd009d027bd1240f8b836e056be4bf Mon Sep 17 00:00:00 2001 From: Victor Raton <43411882+victorfernandesraton@users.noreply.github.com> Date: Sat, 7 Sep 2024 20:37:13 -0300 Subject: [PATCH] =?UTF-8?q?fix:=20removendo=20m=C3=A9todos=20e=20otimizand?= =?UTF-8?q?o=20a=20classe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../spiders/ma/ma_sao_jose_dos_basilios.py | 61 +++---------------- 1 file changed, 10 insertions(+), 51 deletions(-) diff --git a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py index f43fa4c83..d1d2358e7 100644 --- a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py +++ b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py @@ -15,17 +15,7 @@ class MaSaoJoseDosBasiliosSpider(BaseGazetteSpider): BASE_URL = "https://diariooficial.saojosedosbasilios.ma.gov.br" def start_requests(self): - yield scrapy.Request(self.get_url()) - - def parse_pagination(self, response): - """ - This parse function is used to get all the pages available and - return request object for each one - """ - return [ - scrapy.Request(self.get_url(page), callback=self.parse) - for page in range(1, 1 + self.get_last_page(response)) - ] + yield scrapy.Request(f"{self.BASE_URL}/home") def parse(self, response, page=1): """ @@ -37,7 +27,6 @@ def parse(self, response, page=1): for gazette_box in gazette_boxes: edition_number = self.get_edition_number(gazette_box) - file_url = self.get_pdf_url(edition_number) date = self.get_gazette_date(gazette_box) if date > self.end_date: @@ -47,58 +36,28 @@ def parse(self, response, page=1): yield Gazette( date=date, - file_urls=[file_url], + file_urls=[ + f"{self.BASE_URL}/diariooficial/getFile/{edition_number}/download=true" + ], edition_number=edition_number, power="executive_legislative", ) - last_page = self.get_last_page(response) - if page < last_page: - yield scrapy.Request( - url=self.get_url(page + 1), cb_kwargs={"page": page + 1} - ) - - def get_url(self, page=1): - return f"{self.BASE_URL}/home?page={page}" - - @staticmethod - def get_last_page(response): - """ - Gets the last page number available in the pages navigation menu - """ - pages = response.css("ul.pagination li.page-item a::text").getall() - if len(pages) == 0: - return 1 - return max([int(page) for page in pages if page.isnumeric()]) - - def get_pdf_url(self, edition_number): - """ - Gets the url for the gazette inside one of the 'div#edicoes-anteriores' table - """ - return f"{self.BASE_URL}/diariooficial/getFile/{edition_number}/download=true" + next_page_url = response.css("a.page-link[rel='next']::attr(href)").get() + if next_page_url: + yield scrapy.Request(url=next_page_url) def get_gazette_date(self, response_item): """ Get the date for the gazette inside one of the 'div#edicoes-anteriores' table """ - date = response_item.css("td:nth-child(3)::text").get().strip() - date_cut = self.__format_date(date) - return parse(date_cut, date_formats=["%d - %B - %Y"], languages=["pt"]).date() - - @staticmethod - def __format_date(date): - split_date = date.split(",") - return split_date[1] + date = response_item.css("td:nth-child(3)::text").get().strip().split(",")[1] + return parse(date, date_formats=["%d - %B - %Y"], languages=["pt"]).date() def get_edition_number(self, response_item): """ Get the edition number inside one of the 'div#edicoes-anteriores' table """ text_edition = response_item.css("td:nth-child(1) a::text").get().strip() - return self.__cut_edition_number(text_edition) - - @staticmethod - def __cut_edition_number(text): - split_text = text.split(" ") - split_number_year = split_text[3].split("/") + split_number_year = text_edition.split(" ")[3].split("/") return split_number_year[0]