Skip to content

Commit

Permalink
feat: filename - atualiza lógica de nomeação de arquivo baixado
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju committed Feb 3, 2025
1 parent f606ea7 commit ee16282
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions data_collection/gazette/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,14 +181,18 @@ def file_path(self, request, response=None, info=None, item=None):
)
# The default path built by the Scrapy base class begins with "full/". In
# this class we replace that prefix with the public_entity_id and gazette date.
filename = filepath.name
file_extension = filepath.suffix

if response is not None and not filepath.suffix:
filename = self._get_filename_with_extension(filename, response)
if response is not None and not file_extension:
file_extension = self._get_file_extension(response)

filename = (
f'{item["date"]}_{item["spider_name"]}_{filepath.stem}{file_extension}'
)

return str(Path(item["public_entity_id"], item["date"], filename))

def _get_filename_with_extension(self, filename, response):
def _get_file_extension(self, response):
# Most gazettes are PDF files, so we first validate the document's
# Content-Type, which is faster, before falling back to the more costly
# check with the filetype library.
Expand All @@ -202,4 +206,4 @@ def _get_filename_with_extension(self, filename, response):
file_kind = filetype.guess(response.body[:max_file_header_size])
file_extension = f".{file_kind.extension}" if file_kind is not None else ""

return f"{filename}{file_extension}"
return file_extension

0 comments on commit ee16282

Please sign in to comment.