Skip to content

community: Handle classification failure in PebbloSafeLoader and PebbloRetrievalQA #63

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
AuthContext,
SemanticContext,
)
from langchain_community.utilities.pebblo import CLASSIFICATION_UNAVAILABLE

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -522,10 +523,28 @@ def _set_semantic_enforcement_filter(
This method sets the semantic enforcement filter in the search_kwargs
of the retriever based on the type of the vectorstore.
"""
# Add CLASSIFICATION_UNAVAILABLE to each deny list that is already non-empty
add_unavailable_to_deny_list(semantic_context)
# Apply semantic filter
search_kwargs = retriever.search_kwargs
if retriever.vectorstore.__class__.__name__ == PINECONE:
_apply_pinecone_semantic_filter(search_kwargs, semantic_context)
elif retriever.vectorstore.__class__.__name__ == QDRANT:
_apply_qdrant_semantic_filter(search_kwargs, semantic_context)
elif retriever.vectorstore.__class__.__name__ == PGVECTOR:
_apply_pgvector_semantic_filter(search_kwargs, semantic_context)


def add_unavailable_to_deny_list(sem_ctx: Optional[SemanticContext]) -> None:
    """
    Extend every non-empty deny list with CLASSIFICATION_UNAVAILABLE.

    The entity and topic deny lists of the given semantic context are
    updated in place. A deny list that is missing or empty is left
    untouched, and the marker is never added twice. This is meant to
    handle documents with missing semantic metadata.

    Args:
        sem_ctx (Optional[SemanticContext]): Semantic context whose deny
            lists should be extended; nothing happens when it is None.
    """
    if sem_ctx is None:
        return

    # Entities first, then topics; each attribute is only looked up when
    # its turn comes.
    for field_name in ("pebblo_semantic_entities", "pebblo_semantic_topics"):
        rule = getattr(sem_ctx, field_name)
        if not rule:
            continue
        denied = rule.deny
        # Only augment a deny list that already has entries, and skip it
        # if the marker is already present.
        if denied and CLASSIFICATION_UNAVAILABLE not in denied:
            denied.append(CLASSIFICATION_UNAVAILABLE)
31 changes: 21 additions & 10 deletions libs/community/langchain_community/document_loaders/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.utilities.pebblo import (
APP_DISCOVER_URL,
CLASSIFICATION_UNAVAILABLE,
CLASSIFIER_URL,
LOADER_DOC_URL,
PEBBLO_CLOUD_URL,
Expand Down Expand Up @@ -481,10 +482,15 @@ def _add_semantic_to_docs(self, classified_docs: Dict) -> List[Document]:
for doc in self.docs_with_id
}

for classified_doc in classified_docs.values():
doc_id = classified_doc.get("pb_id")
if doc_id in indexed_docs:
self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc)
if not classified_docs:
# No classification results: mark every loaded document's semantic
# metadata as CLASSIFICATION_UNAVAILABLE
for _, doc in indexed_docs.items():
self._add_semantic_to_doc(doc, {})
else:
for classified_doc in classified_docs.values():
doc_id = classified_doc.get("pb_id")
if doc_id in indexed_docs:
self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc)

semantic_metadata_docs = [doc for doc in indexed_docs.values()]

Expand All @@ -506,6 +512,7 @@ def _unindex_docs(self) -> List[Document]:
def _add_semantic_to_doc(self, doc: Document, classified_doc: dict) -> Document:
"""
Adds semantic metadata to the given document in-place.
If classified_doc is empty, adds "unavailable" to semantic data.

Args:
doc (Document): A Document object.
Expand All @@ -514,12 +521,16 @@ def _add_semantic_to_doc(self, doc: Document, classified_doc: dict) -> Document:
Returns:
Document: The Document object with added semantic metadata.
"""
doc.metadata["pebblo_semantic_entities"] = list(
classified_doc.get("entities", {}).keys()
)
doc.metadata["pebblo_semantic_topics"] = list(
classified_doc.get("topics", {}).keys()
)
if classified_doc:
doc.metadata["pebblo_semantic_entities"] = list(
classified_doc.get("entities", {}).keys()
)
doc.metadata["pebblo_semantic_topics"] = list(
classified_doc.get("topics", {}).keys()
)
else:
doc.metadata["pebblo_semantic_entities"] = [CLASSIFICATION_UNAVAILABLE]
doc.metadata["pebblo_semantic_topics"] = [CLASSIFICATION_UNAVAILABLE]
return doc

def _add_pebblo_specific_metadata(self, classified_docs: dict) -> None:
Expand Down
1 change: 1 addition & 0 deletions libs/community/langchain_community/utilities/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
}

SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)
CLASSIFICATION_UNAVAILABLE = "unavailable"

logger = logging.getLogger(__name__)

Expand Down
Loading