forked from danny-avila/rag_api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparsers.py
29 lines (23 loc) · 937 Bytes
/
parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from typing import List, Optional
from langchain.schema import Document
from config import CHUNK_OVERLAP
def process_documents(documents: List[Document]) -> str:
    """Stitch a sequence of document chunks back into one annotated text.

    The result starts with the basename of the first ``source`` found in
    any chunk's metadata (nothing if no chunk has one). Each chunk's
    ``page_content`` is then appended in order. Whenever a chunk's ``page``
    metadata is present and differs from the last page seen, a
    ``# PAGE <n>`` heading is emitted before the chunk's text. If the text
    accumulated so far already ends with the first ``CHUNK_OVERLAP``
    characters of a chunk, that prefix is skipped so the overlapping region
    is not written twice.

    Args:
        documents: Chunks in reading order. Only ``metadata`` (read for the
            ``source`` and ``page`` keys) and ``page_content`` are used.

    Returns:
        The combined text with leading and trailing whitespace stripped;
        an empty string when ``documents`` is empty.
    """
    # Title line: basename of the first chunk that carries a source path.
    doc_basename = next(
        (
            doc.metadata['source'].split('/')[-1]
            for doc in documents
            if 'source' in doc.metadata
        ),
        "",
    )

    processed_text = f"{doc_basename}\n"
    last_page: Optional[int] = None

    for doc in documents:
        current_page = doc.metadata.get('page')
        # Test against None rather than truthiness: 0 is a valid page
        # number (0-based loaders), and a truthiness check would silently
        # drop the heading for the first page.
        if current_page is not None and current_page != last_page:
            processed_text += f"\n# PAGE {current_page}\n\n"
            last_page = current_page

        new_content = doc.page_content
        # Drop the leading overlap only when it verbatim repeats the tail
        # of what has already been written; otherwise keep the full chunk.
        if processed_text.endswith(new_content[:CHUNK_OVERLAP]):
            processed_text += new_content[CHUNK_OVERLAP:]
        else:
            processed_text += new_content

    return processed_text.strip()