|
7 | 7 | import boto3
|
8 | 8 | import json
|
9 | 9 | import concurrent.futures
|
10 |
| -from collections import ChainMap |
| 10 | +from collections import ChainMap, defaultdict |
11 | 11 | from datetime import datetime
|
12 | 12 | from pathlib import Path
|
13 | 13 | from typing import Any, Callable, Dict, List, Optional, Union
|
@@ -201,7 +201,7 @@ def init_s3(self, force=False) -> None:
|
201 | 201 |
|
202 | 202 | def write_object_to_file(self, data, file_path):
|
203 | 203 | try:
|
204 |
| - with open(file_path, 'w') as file: # encoding='utf-8-sig' |
| 204 | + with open(file_path, 'w', encoding='utf8') as file: |
205 | 205 | json.dump(data, file, indent=2, ensure_ascii=False)
|
206 | 206 | except Exception as e:
|
207 | 207 | logging.getLogger().error(f"Error writing to {file_path}: {e}")
|
@@ -339,9 +339,26 @@ def get_files_from_url(self) -> list[str]:
|
339 | 339 | original_key = f"{self.prefix}/{doc_num}" if self.prefix else doc_num
|
340 | 340 |
|
341 | 341 | if self.skip_existing_file:
|
| 342 | + doc_num = item.get('document_id', None) |
| 343 | + url = item.get('url', None) |
| 344 | + if url is None: |
| 345 | + logging.getLogger().error(f"{doc_num} invalid url") |
| 346 | + continue |
| 347 | + filename_with_extension = os.path.basename(url) |
| 348 | + doc_name, file_extension = os.path.splitext(filename_with_extension) |
| 349 | + file_extension = file_extension.lstrip(".") |
| 350 | + file_type = self.get_file_extension(filename_with_extension) |
342 | 351 | extension = file_type if file_type is not None else self.get_file_extension(doc_name)
|
343 | 352 | complete_file_path = f"{temp_dir}/{doc_num}.{extension}"
|
344 | 353 | if os.path.exists(complete_file_path):
|
| 354 | + self.element_ids.add(doc_num) |
| 355 | + doc_nums.append(doc_num) |
| 356 | + self.element_dict[doc_num] = item |
| 357 | + continue |
| 358 | + if not self.is_supported_extension(file_type.lower()): |
| 359 | + skip_count += 1 |
| 360 | + self.skip_dict[doc_num] = item |
| 361 | + logging.getLogger().warning(f"{doc_num} '{doc_name}' with '{file_type}' extension discarded") |
345 | 362 | continue
|
346 | 363 |
|
347 | 364 | if self.skip_storage_download:
|
@@ -376,7 +393,7 @@ def get_files_from_url(self) -> list[str]:
|
376 | 393 | logging.getLogger().error(f"Failed to download file. Status code: {response.status_code}")
|
377 | 394 | continue
|
378 | 395 | except Exception as e:
|
379 |
| - logging.getLogger().error(f"Error downloading {doc_num} '{doc_name}' {e}") |
| 396 | + logging.getLogger().error(f"Error downloading {url} doc:'{doc_num}' name:'{doc_name}' error: {e}") |
380 | 397 | continue
|
381 | 398 | else:
|
382 | 399 | try:
|
@@ -430,14 +447,17 @@ def get_metadata_whitelist_items(self, initial: dict, metadata_whitelist: list[s
|
430 | 447 | return {k: v for k, v in initial.items() if k in metadata_whitelist}
|
431 | 448 |
|
def save_debug(self, serialized_docs: Any, prefix: str) -> Optional[str]:
    '''Dump ``serialized_docs`` as pretty-printed JSON under ``<cwd>/debug``.

    The file is named ``<prefix>_<YYYYmmddHHMMSS>.json``. The ``debug``
    folder is created on demand, so the first call in a fresh working
    directory no longer fails.

    Args:
        serialized_docs: Any JSON-serializable object.
        prefix: Leading part of the generated file name.

    Returns:
        The full path of the written file, or ``None`` when the dump could
        not be written (the error is logged, never raised).
    '''
    try:
        debug_folder = os.path.join(os.getcwd(), 'debug')
        # Without this, open() raises FileNotFoundError whenever the folder
        # has not been created by hand.
        os.makedirs(debug_folder, exist_ok=True)
        formatted_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        file_path = os.path.join(debug_folder, f"{prefix}_{formatted_timestamp}.json")
        with open(file_path, 'w', encoding='utf8') as json_file:
            json.dump(serialized_docs, json_file, ensure_ascii=False, indent=4)
    except Exception as e:
        # Deliberately broad: this is a best-effort debugging aid and must
        # not abort the caller's processing.
        logging.getLogger().error(f"Error saving debug file: {e}")
        return None
    return file_path
441 | 461 |
|
442 | 462 | def get_file_extension(self, name) -> str:
|
443 | 463 | '''get extension without the leading dot'''
|
@@ -763,16 +783,24 @@ def augment_metadata(
|
763 | 783 | source_url = f"{self.source_base_url}?{self.source_doc_id}={id}&CONTDISP=INLINE"
|
764 | 784 | initial_metadata['url'] = source_url
|
765 | 785 |
|
| 786 | + url = initial_metadata.get('url', None) |
| 787 | + if url: |
| 788 | + replacements = self.alternative_document_service.get('url_replacements', {}) |
| 789 | + for replacement in replacements: |
| 790 | + old_str = replacement['search'] |
| 791 | + new_str = replacement['replace'] |
| 792 | + if old_str in url: |
| 793 | + url = url.replace(old_str, new_str) |
766 | 794 | self.generate_description(initial_metadata, date_string_description, id)
|
767 | 795 | except Exception as e:
|
768 | 796 | logging.getLogger().error(f"Error augmenting metadata for '{document_name}' from {initial_metadata} Error: {e}")
|
769 | 797 |
|
770 | 798 | return initial_metadata
|
771 | 799 |
|
772 | 800 | def generate_description(self, initial_metadata, date_string_description, id:any):
|
773 |
| - |
| 801 | + default_values = defaultdict(lambda: "", initial_metadata) |
774 | 802 | if not self.description_template is None:
|
775 |
| - description = self.description_template.format(**initial_metadata) |
| 803 | + description = self.description_template.format(**default_values) |
776 | 804 | else:
|
777 | 805 | name = initial_metadata.get('filename', id)
|
778 | 806 | activity = initial_metadata.get('disclosureactivity', '')
|
|
0 commit comments