πŸ” Add Retriever Logic using FAISS Vector Store #5

Open: wants to merge 2 commits into base: main
87 changes: 48 additions & 39 deletions app/main.py
@@ -1,46 +1,55 @@
-from typing import Dict
-
-from fastapi import FastAPI, HTTPException, Depends
-from fastapi.security import HTTPBasic, HTTPBasicCredentials
+from fastapi import FastAPI, Depends, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.security import OAuth2PasswordRequestForm
+from app.services.auth_service import authenticate_user, create_access_token, decode_access_token
+from app.schemas.token import Token
+from fastapi import Request
+from app.services.retriever_service import retrieve_similar_docs
 
 app = FastAPI()
-security = HTTPBasic()
 
-# Dummy user database
-users_db: Dict[str, Dict[str, str]] = {
-    "Tony": {"password": "password123", "role": "engineering"},
-    "Bruce": {"password": "securepass", "role": "marketing"},
-    "Sam": {"password": "financepass", "role": "finance"},
-    "Peter": {"password": "pete123", "role": "engineering"},
-    "Sid": {"password": "sidpass123", "role": "marketing"},
-    "Natasha": {"passwoed": "hrpass123", "role": "hr"}
-}
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"], allow_credentials=True,
+    allow_methods=["*"], allow_headers=["*"]
+)
 
 
-# Authentication dependency
-def authenticate(credentials: HTTPBasicCredentials = Depends(security)):
-    username = credentials.username
-    password = credentials.password
-    user = users_db.get(username)
-    if not user or user["password"] != password:
-        raise HTTPException(status_code=401, detail="Invalid credentials")
-    return {"username": username, "role": user["role"]}
-
-
-# Login endpoint
-@app.get("/login")
-def login(user=Depends(authenticate)):
-    return {"message": f"Welcome {user['username']}!", "role": user["role"]}
-
-
-# Protected test endpoint
-@app.get("/test")
-def test(user=Depends(authenticate)):
-    return {"message": f"Hello {user['username']}! You can now chat.", "role": user["role"]}
+@app.post("/login", response_model=Token)
+def login(form_data: OAuth2PasswordRequestForm = Depends()):
+    user = authenticate_user(form_data.username, form_data.password)
+    if not user:
+        raise HTTPException(status_code=401, detail="Invalid credentials")
+
+    token = create_access_token({
+        "sub": user["username"],
+        "role": user["role"]
+    })
+    return {"access_token": token, "token_type": "bearer"}
+
+def get_current_user(request: Request):
+    auth_header = request.headers.get("Authorization")
+    if not auth_header or not auth_header.startswith("Bearer "):
+        raise HTTPException(status_code=401, detail="Missing or invalid token")
+
+    token = auth_header.split(" ")[1]
+    user_data = decode_access_token(token)
+    if not user_data:
+        raise HTTPException(status_code=401, detail="Invalid token")
+
+    return user_data
 
 # Protected chat endpoint
 @app.post("/chat")
-def query(user=Depends(authenticate), message: str = "Hello"):
-    return "Implement this endpoint."
+def chat(query: str, user=Depends(get_current_user)):
+    if user["role"] not in ["engineering", "hr", "finance", "marketing"]:
+        raise HTTPException(status_code=403, detail="Access denied for your role.")
+
+    # Dummy role-based logic for now
+    return {
+        "response": f"Hi {user['username']}, you asked: '{query}' (role: {user['role']})"
+    }
+
+
+@app.get("/test-retrieve")
+def test_retrieve(query: str, request: Request):
+    user = get_current_user(request)
+    return retrieve_similar_docs(query, user["role"])
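
A quick way to exercise the new flow end to end. This is a minimal sketch, not part of the PR: it assumes the app is running locally via uvicorn app.main:app on the default port 8000, that the requests package is installed, and it uses one of the dummy accounts from app/utils/user_db.py.

import requests

BASE = "http://localhost:8000"

# /login expects OAuth2 form fields (username/password), not JSON
resp = requests.post(f"{BASE}/login", data={"username": "Tony", "password": "password123"})
token = resp.json()["access_token"]
headers = {"Authorization": f"Bearer {token}"}

# query is passed as a plain query parameter on both endpoints
print(requests.post(f"{BASE}/chat", params={"query": "ping"}, headers=headers).json())
print(requests.get(f"{BASE}/test-retrieve", params={"query": "ping"}, headers=headers).json())
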
5 changes: 5 additions & 0 deletions app/schemas/token.py
@@ -0,0 +1,5 @@
from pydantic import BaseModel

class Token(BaseModel):
    access_token: str
    token_type: str
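
For reference, a successful /login response serialized through this schema looks like the following (token value elided; an illustration, not output from a real run):

{
    "access_token": "<JWT string>",
    "token_type": "bearer"
}
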
5 changes: 5 additions & 0 deletions app/schemas/user.py
@@ -0,0 +1,5 @@
from pydantic import BaseModel

class UserLogin(BaseModel):
    username: str
    password: str
30 changes: 30 additions & 0 deletions app/services/auth_service.py
@@ -0,0 +1,30 @@
import os
from datetime import datetime, timedelta, timezone
from jose import jwt, JWTError
from dotenv import load_dotenv
from app.utils.user_db import users_db

load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))

SECRET_KEY = os.getenv("JWT_SECRET")
ALGORITHM = os.getenv("JWT_ALGORITHM")
EXPIRATION = int(os.getenv("JWT_EXPIRATION_SECONDS"))

def authenticate_user(username: str, password: str):
    user = users_db.get(username)
    if user and user["password"] == password:
        return {"username": username, "role": user["role"]}
    return None

def create_access_token(data: dict):
    to_encode = data.copy()
    expire = datetime.now(timezone.utc) + timedelta(seconds=EXPIRATION)
    to_encode.update({"exp": expire})
    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)

def decode_access_token(token: str):
    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        return {"username": payload.get("sub"), "role": payload.get("role")}
    except JWTError:
        return None
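
auth_service reads its settings from a .env file one level above app/services (app/.env, per the load_dotenv path above). A minimal sketch of that file: the variable names come from the os.getenv calls, while the values and the HS256 choice are placeholders and assumptions, not something this PR pins down.

JWT_SECRET=replace-with-a-long-random-string
JWT_ALGORITHM=HS256
JWT_EXPIRATION_SECONDS=3600

Note that JWT_EXPIRATION_SECONDS is wrapped in int(), so the module fails at import time if the variable is missing or non-numeric.
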
Empty file added app/services/llm_service.py
Empty file.
32 changes: 32 additions & 0 deletions app/services/retriever_service.py
@@ -0,0 +1,32 @@
import os
import pickle
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

VECTOR_DIR = "vector_data"
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def load_faiss_index(role: str):
    index_path = os.path.join(VECTOR_DIR, f"{role}_index.faiss")
    docs_path = os.path.join(VECTOR_DIR, f"{role}_docs.pkl")

    if not os.path.exists(index_path) or not os.path.exists(docs_path):
        raise ValueError(f"No vector index found for role: {role}")

    index = faiss.read_index(index_path)
    with open(docs_path, "rb") as f:
        documents = pickle.load(f)

    return index, documents


def retrieve_similar_docs(query: str, role: str, top_k: int = 3) -> list[str]:
    index, documents = load_faiss_index(role)
    query_vector = embedding_model.encode([query])

    distances, indices = index.search(query_vector, top_k)
    results = [documents[i] for i in indices[0] if 0 <= i < len(documents)]

    return results
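
For ad-hoc testing without going through the API, the retriever can also be called directly. A sketch, assuming the FAISS index for the chosen role has already been built under vector_data/ and using a made-up query string:

from app.services.retriever_service import retrieve_similar_docs

docs = retrieve_similar_docs("How do we deploy the service?", role="engineering", top_k=3)
for doc in docs:
    print(doc[:120])
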
65 changes: 65 additions & 0 deletions app/services/vector_store.py
@@ -0,0 +1,65 @@
from sentence_transformers import SentenceTransformer
import faiss
import os
import pickle
import pandas as pd

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

VECTOR_DIR = "vector_data"
RESOURCE_DIR = "resources/data"


def load_documents(role: str) -> list[str]:
    role_path = os.path.join(RESOURCE_DIR, role)
    documents = []

    if not os.path.exists(role_path):
        print(f"⚠️ Directory for role '{role}' does not exist: {role_path}")
        return []

    for filename in os.listdir(role_path):
        file_path = os.path.join(role_path, filename)

        try:
            if filename.endswith(".csv"):
                df = pd.read_csv(file_path)
                for col in df.select_dtypes(include=[object]):
                    documents.extend(df[col].dropna().astype(str).tolist())

            elif filename.endswith(".txt") or filename.endswith(".md"):
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read().strip()
                    if text:
                        documents.append(text)

        except Exception as e:
            print(f"⚠️ Failed to read {file_path}: {e}")

    print(f"✅ Loaded {len(documents)} documents for role: {role}")
    return documents


def build_faiss_index(role: str, documents: list[str]):
    if not documents:
        print(f"⚠️ Skipping {role}, no documents found.")
        return

    vectors = embedding_model.encode(documents, show_progress_bar=True)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    os.makedirs(VECTOR_DIR, exist_ok=True)
    faiss.write_index(index, f"{VECTOR_DIR}/{role}_index.faiss")

    with open(f"{VECTOR_DIR}/{role}_docs.pkl", "wb") as f:
        pickle.dump(documents, f)

    print(f"✅ FAISS index built and saved for role: {role}")


if __name__ == "__main__":
    roles = ["general", "marketing", "engineering", "finance", "hr"]
    for role in roles:
        docs = load_documents(role)
        build_faiss_index(role, docs)
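
Because VECTOR_DIR and RESOURCE_DIR are relative paths, the build step should be run from the repository root, either as python -m app.services.vector_store (which rebuilds every role via the __main__ block above) or programmatically for a single role. A sketch, assuming resources/data/engineering/ already holds the source documents:

from app.services.vector_store import load_documents, build_faiss_index

docs = load_documents("engineering")
build_faiss_index("engineering", docs)
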
8 changes: 8 additions & 0 deletions app/utils/user_db.py
@@ -0,0 +1,8 @@
users_db = {
    "Tony": {"password": "password123", "role": "engineering"},
    "Bruce": {"password": "securepass", "role": "marketing"},
    "Sam": {"password": "financepass", "role": "finance"},
    "Peter": {"password": "pete123", "role": "engineering"},
    "Sid": {"password": "sidpass123", "role": "marketing"},
    "Natasha": {"password": "hrpass123", "role": "hr"},
}
17 changes: 16 additions & 1 deletion pyproject.toml
@@ -1,9 +1,24 @@
 [project]
 name = "ds-rpc-01"
 version = "0.1.0"
-description = "Starter project for the RPC-01: Internal Chatbot with Role Based Access Control"
+description = "RAG-based chatbot with role-based access control"
 readme = "README.md"
 requires-python = ">=3.10"
+
 dependencies = [
     "fastapi[standard]>=0.115.12",
+    "uvicorn>=0.22.0",
+    "python-jose[cryptography]>=3.3.0",
+    "python-dotenv>=1.0.0",
+    "sentence-transformers>=2.2.2",
+    "faiss-cpu>=1.7.4",
+    "pandas>=2.2.2"
 ]
+
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+packages = ["app"]
Binary file added vector_data/engineering_docs.pkl
Binary file not shown.
Binary file added vector_data/engineering_index.faiss
Binary file not shown.
Binary file added vector_data/finance_docs.pkl
Binary file not shown.
Binary file added vector_data/finance_index.faiss
Binary file not shown.
Binary file added vector_data/general_docs.pkl
Binary file not shown.
Binary file added vector_data/general_index.faiss
Binary file not shown.
Binary file added vector_data/hr_docs.pkl
Binary file not shown.
Binary file added vector_data/hr_index.faiss
Binary file not shown.
Binary file added vector_data/marketing_docs.pkl
Binary file not shown.
Binary file added vector_data/marketing_index.faiss
Binary file not shown.