From c71c350eef36a92a50ea26283100e7255f85aadb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Rocha?= Date: Fri, 12 Jul 2024 08:30:40 -0300 Subject: [PATCH] Ajustar regex de coleta de metadado #1187 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Classe base Adiarios V1: - `edition_number` com valor padrão como string vazia; - `is_extra_edition` verificada no título e também no texto. --- data_collection/gazette/spiders/base/adiarios_v1.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/data_collection/gazette/spiders/base/adiarios_v1.py b/data_collection/gazette/spiders/base/adiarios_v1.py index 2d8f8b4e7..4b39af1d2 100644 --- a/data_collection/gazette/spiders/base/adiarios_v1.py +++ b/data_collection/gazette/spiders/base/adiarios_v1.py @@ -38,13 +38,20 @@ def parse_page(self, response): date = datetime.strptime(date, "%d/%m/%Y").date() text = element.css("span strong::text").get() - edition_number = re.search(r":\s*(\d+).*/", text).group(1) + + try: + edition_number = re.search(r":\s*(\d+).*/", text).group(1) + except AttributeError: + edition_number = "" title = element.css("span::text").getall()[1] is_extra_edition = bool( re.search( r"complementar|suplementar|extra|especial", title, re.IGNORECASE ) + or re.search( + r"complementar|suplementar|extra|especial", text, re.IGNORECASE + ) ) power = self.get_power(title)