Skip to content

Commit 5722e0b

Browse files
committed
add replacements section for URL
1 parent 6bb7d8b commit 5722e0b

File tree

4 files changed

+52
-18
lines changed

4 files changed

+52
-18
lines changed

amazon_s3/s3reader.py

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import boto3
88
import json
99
import concurrent.futures
10-
from collections import ChainMap
10+
from collections import ChainMap, defaultdict
1111
from datetime import datetime
1212
from pathlib import Path
1313
from typing import Any, Callable, Dict, List, Optional, Union
@@ -201,7 +201,7 @@ def init_s3(self, force=False) -> None:
201201

202202
def write_object_to_file(self, data, file_path):
203203
try:
204-
with open(file_path, 'w') as file: # encoding='utf-8-sig'
204+
with open(file_path, 'w', encoding='utf8') as file:
205205
json.dump(data, file, indent=2, ensure_ascii=False)
206206
except Exception as e:
207207
logging.getLogger().error(f"Error writing to {file_path}: {e}")
@@ -339,9 +339,26 @@ def get_files_from_url(self) -> list[str]:
339339
original_key = f"{self.prefix}/{doc_num}" if self.prefix else doc_num
340340

341341
if self.skip_existing_file:
342+
doc_num = item.get('document_id', None)
343+
url = item.get('url', None)
344+
if url is None:
345+
logging.getLogger().error(f"{doc_num} invalid url")
346+
continue
347+
filename_with_extension = os.path.basename(url)
348+
doc_name, file_extension = os.path.splitext(filename_with_extension)
349+
file_extension = file_extension.lstrip(".")
350+
file_type = self.get_file_extension(filename_with_extension)
342351
extension = file_type if file_type is not None else self.get_file_extension(doc_name)
343352
complete_file_path = f"{temp_dir}/{doc_num}.{extension}"
344353
if os.path.exists(complete_file_path):
354+
self.element_ids.add(doc_num)
355+
doc_nums.append(doc_num)
356+
self.element_dict[doc_num] = item
357+
continue
358+
if not self.is_supported_extension(file_type.lower()):
359+
skip_count += 1
360+
self.skip_dict[doc_num] = item
361+
logging.getLogger().warning(f"{doc_num} '{doc_name}' with '{file_type}' extension discarded")
345362
continue
346363

347364
if self.skip_storage_download:
@@ -376,7 +393,7 @@ def get_files_from_url(self) -> list[str]:
376393
logging.getLogger().error(f"Failed to download file. Status code: {response.status_code}")
377394
continue
378395
except Exception as e:
379-
logging.getLogger().error(f"Error downloading {doc_num} '{doc_name}' {e}")
396+
logging.getLogger().error(f"Error downloading {url} doc:'{doc_num}' name:'{doc_name}' error: {e}")
380397
continue
381398
else:
382399
try:
@@ -430,14 +447,17 @@ def get_metadata_whitelist_items(self, initial: dict, metadata_whitelist: list[s
430447
return {k: v for k, v in initial.items() if k in metadata_whitelist}
431448

432449
def save_debug(self, serialized_docs: any, prefix:str) -> str:
433-
debug_folder = os.path.join(os.getcwd(), 'debug')
434-
now = datetime.now()
435-
formatted_timestamp = now.strftime("%Y%m%d%H%M%S")
436-
filename = '%s_%s.json' % (prefix, formatted_timestamp)
437-
file_path = os.path.join(debug_folder, filename)
438-
with open(file_path, 'w', encoding='utf8') as json_file:
439-
json.dump(serialized_docs, json_file, ensure_ascii=False, indent=4)
440-
return file_path
450+
try:
451+
debug_folder = os.path.join(os.getcwd(), 'debug')
452+
now = datetime.now()
453+
formatted_timestamp = now.strftime("%Y%m%d%H%M%S")
454+
filename = '%s_%s.json' % (prefix, formatted_timestamp)
455+
file_path = os.path.join(debug_folder, filename)
456+
with open(file_path, 'w', encoding='utf8') as json_file:
457+
json.dump(serialized_docs, json_file, ensure_ascii=False, indent=4)
458+
return file_path
459+
except Exception as e:
460+
logging.getLogger().error(f"Error saving debug file: {e}")
441461

442462
def get_file_extension(self, name) -> str:
443463
'''get extension without the leading dot'''
@@ -763,16 +783,24 @@ def augment_metadata(
763783
source_url = f"{self.source_base_url}?{self.source_doc_id}={id}&CONTDISP=INLINE"
764784
initial_metadata['url'] = source_url
765785

786+
url = initial_metadata.get('url', None)
787+
if url:
788+
replacements = self.alternative_document_service.get('url_replacements', {})
789+
for replacement in replacements:
790+
old_str = replacement['search']
791+
new_str = replacement['replace']
792+
if old_str in url:
793+
url = url.replace(old_str, new_str)
766794
self.generate_description(initial_metadata, date_string_description, id)
767795
except Exception as e:
768796
logging.getLogger().error(f"Error augmenting metadata for '{document_name}' from {initial_metadata} Error: {e}")
769797

770798
return initial_metadata
771799

772800
def generate_description(self, initial_metadata, date_string_description, id:any):
773-
801+
default_values = defaultdict(lambda: "", initial_metadata)
774802
if not self.description_template is None:
775-
description = self.description_template.format(**initial_metadata)
803+
description = self.description_template.format(**default_values)
776804
else:
777805
name = initial_metadata.get('filename', id)
778806
activity = initial_metadata.get('disclosureactivity', '')

saia_ingest/file_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def load_hashes_from_json(folder: Path, id:str='documentid') -> Dict[str, Any]:
3838
duplicate_count = 0
3939
for json_file in folder.glob("*.json"):
4040
try:
41-
with json_file.open('r') as f:
41+
with json_file.open('r', encoding='utf-8') as f:
4242
data = json.load(f)
4343
if Defaults.FILE_HASH in data:
4444
file_hash = data[Defaults.FILE_HASH]

saia_ingest/ingestor.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from datetime import datetime, timezone
55
import json
66
import concurrent.futures
7+
import traceback
78

89
#from llama_index import QueryBundle
910
#from llama_index.retrievers import BaseRetriever
@@ -636,7 +637,8 @@ def ingest_s3(
636637
ret = ingest(documents, openapi_key, index_name, namespace, embeddings_model)
637638

638639
except Exception as e:
639-
logging.getLogger().error(f"Error: {e}")
640+
logging.getLogger().error(f"Ingest Error: {e}")
641+
traceback.print_exc()
640642
ret = False
641643
finally:
642644
end_time = time.time()

saia_ingest/rag_api.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -244,9 +244,13 @@ def upload_document_binary(self, file_path, content_type = 'application/pdf', pr
244244
"Content-Type": content_type
245245
}
246246
headers.update(self.base_header)
247-
with open(file_path, 'rb') as file:
248-
response = self._do_request(POST_METHOD, url, headers=headers, data=file)
249-
return response.json()
247+
try:
248+
with open(file_path, 'rb') as file:
249+
response = self._do_request(POST_METHOD, url, headers=headers, data=file)
250+
return response.json()
251+
except Exception as e:
252+
logging.getLogger().info(f"Invalid upload {file_path} {e}")
253+
raise e
250254

251255
def _is_valid_json(self, my_json):
252256
try:

0 commit comments

Comments (0)