Commit 3dda818

added audio transcription feature
Merge: 2 parents 9cdd323 + 409bdd0

File tree: 7 files changed, 175 additions and 50 deletions

.gitignore
app.py
audio_transcribe.py
chromadb_operations.py
prompt_templates.py
style.css
text_processor.py

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@ __pycache__/
 .vscode/
 docs
 vectorstore
+sessions
+docs

 # environment
 venv

app.py

Lines changed: 98 additions & 31 deletions
@@ -1,13 +1,36 @@
-import ollama
-import streamlit as st
 import os
-from utils import save_chat_history,load_chat_history_json,get_timestamp
-from datetime import datetime
-from streamlit_mic_recorder import mic_recorder
-from transformers import pipeline
 import yaml
+from datetime import datetime
+
+import ollama
 import torch
+import streamlit as st
+from transformers import pipeline
+from streamlit_mic_recorder import mic_recorder
+from utils import save_chat_history, load_chat_history_json, get_timestamp
+
+from prompt_templates import SYSTEM_PROMPT
 from audio_transcribe import transcribe_audio
+from text_processor import get_document_chunks
+from chromadb_operations import ChromadbOperations
+
+header = st.container()
+header.title("Knowly")
+header.write("""<div class='fixed-header'/>""", unsafe_allow_html=True)
+
+with header:
+    col1, col2 = st.columns(2)
+    with col1:
+        if "model" not in st.session_state:
+            st.session_state["model"] = ""
+        models = [model["name"] for model in ollama.list()["models"]]
+        st.session_state["model"] = st.selectbox("Choose your model", models)
+    with col2:
+        st.write('Record Audio:')
+        voice_recording = mic_recorder(start_prompt="Start recording", stop_prompt="Stop recording", just_once=True)
+        transcribed_audio_prompt = ''
+        if voice_recording:
+            transcribed_audio_prompt = transcribe_audio(voice_recording["bytes"])

 with open('config.yaml', 'r') as f:
     config = yaml.safe_load(f)
@@ -32,68 +55,112 @@ def set_session_name(session):
         del st.session_state["messages"]
     st.session_state["messages"] = load_chat_history_json(session)

-def model_res_generator():
+def model_res_generator(rag:bool=False):
+    prompt = st.session_state["messages"][-1]["content"]  # extracting last user prompt
+    if rag:
+        context = st.session_state["vector_db"].query(query_text=prompt, k=1)  # fetching similar contexts from vector database
+
+        # creating paragraph of contexts
+        paragraph = ""
+        for i, item in enumerate(context):
+            paragraph += item
+            if i != len(context)-1:
+                paragraph += "\n"
+
+        # replacing user prompt with augmented prompt
+        st.session_state["messages"][-1]["content"] = formatted_prompt(query=prompt, context=paragraph)
+
     stream = ollama.chat(
         model=st.session_state["model"],
         messages=st.session_state["messages"],
         stream=True,
     )
+
+    # replacing augmented prompt with actual user prompt
+    if rag:
+        st.session_state["messages"][-1]["content"] = prompt
     for chunk in stream:
         yield chunk["message"]["content"]

-st.title("Knowly")
-st.sidebar.title("Chat sessions")
+def formatted_prompt(query:str, context:str):
+    return SYSTEM_PROMPT + f"Question: {query}" + f"\n\nContext: {context}"
+
+def save_session(session_key):
+    if "messages" in st.session_state:
+        if st.session_state.session_key == "new_session":
+            st.session_state.session_key = get_timestamp() + '.json'
+            save_chat_history(st.session_state['messages'], st.session_state.session_key)
+        else:
+            save_chat_history(st.session_state['messages'], st.session_state.session_key)

 if "messages" not in st.session_state:
     st.session_state["messages"] = []

-if "model" not in st.session_state:
-    st.session_state["model"] = ""
-
 if "session_key" not in st.session_state:
     if len(os.listdir('sessions/')) != 0:
         st.session_state["session_key"] = os.listdir('sessions/')[-1]
         st.session_state["messages"] = load_chat_history_json(st.session_state.session_key)
     else:
         st.session_state["session_key"] = "new_session"

-def save_session(session_key):
-    if "messages" in st.session_state:
-        if st.session_state.session_key == "new_session":
-            st.session_state.session_key = get_timestamp() + '.json'
-            save_chat_history(st.session_state['messages'],st.session_state.session_key)
-        else:
-            save_chat_history(st.session_state['messages'],st.session_state.session_key)
-
-models = [model["name"] for model in ollama.list()["models"]]
-st.session_state["model"] = st.selectbox("choose you model", models)
-
 load_chat()

-voice_recording = mic_recorder(start_prompt="Start recording", stop_prompt="Stop recording", just_once=True)
-transcribed_audio_prompt = ''
-if voice_recording:
-    transcribed_audio_prompt = transcribe_audio(voice_recording["bytes"])
+with st.sidebar:
+    st.sidebar.write('**Pdf Upload:**')
+    with st.form("my-form", clear_on_submit=True):
+        uploaded_docs = st.file_uploader(label="Upload pdf or text files",
+                                         accept_multiple_files=True,
+                                         key="document_uploader",
+                                         type=["pdf"])
+        submitted = st.form_submit_button("UPLOAD")
+
+        if submitted:
+            print("uploaded docs section is running...")
+            os.makedirs("docs", exist_ok=True)
+            with st.spinner("Processing documents..."):
+                # saving the uploaded files in directory
+                for file_item in uploaded_docs:
+                    with open(f"docs/{file_item.name}", "wb") as f:
+                        f.write(file_item.getbuffer())
+                        f.close()

+                st.session_state["vector_db"] = ChromadbOperations()
+                text_chunks = get_document_chunks(path="docs")
+                st.session_state["vector_db"].insert_data(text_chunks)
+                del st.session_state["document_uploader"]
+
+    # pdf chat
+    pdf_chat_mode = st.sidebar.toggle(label="PDF Chat",
+                                      key="pdf_chat",
+                                      value=False,
+                                      disabled=True if "vectorstore" not in os.listdir(str(os.getcwd())) else False)
+
+    # load the current vector database if exists
+    if pdf_chat_mode:
+        if "vector_db" not in st.session_state.keys() and "vectorstore" in os.listdir(str(os.getcwd())):
+            st.session_state["vector_db"] = ChromadbOperations()
+
 user_prompt = st.chat_input("Enter your question:")
 if user_prompt is not None or transcribed_audio_prompt != '':
     if user_prompt:
         prompt = user_prompt
     else:
         prompt = transcribed_audio_prompt
-
+
     st.session_state["messages"].append({"role" : "user", "content": prompt})

     with st.chat_message("user"):
         st.markdown(prompt)

     with st.chat_message("assistant"):
-        message = st.write_stream(model_res_generator())
-        st.session_state["messages"].append({"role":"assistant", "content" : message})
+        message = st.write_stream(model_res_generator(rag=pdf_chat_mode))
+        st.session_state["messages"].append({"role": "assistant", "content": message})

     save_session(st.session_state.session_key)

-st.sidebar.button(label="new chat", on_click=create_new_chat)
+st.sidebar.write('**Chat History:**')
+
+st.sidebar.button(label="New chat", on_click=create_new_chat)

 session_list = os.listdir("sessions/")
 for session in session_list:
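The retrieval path that model_res_generator adds can be checked outside Streamlit. Below is a minimal sketch of the same round trip, assuming the ChromadbOperations class and SYSTEM_PROMPT introduced in this commit and at least one already-indexed document; the question string is hypothetical:

    from chromadb_operations import ChromadbOperations
    from prompt_templates import SYSTEM_PROMPT

    def formatted_prompt(query: str, context: str) -> str:
        # same composition as in app.py: template, then question, then context
        return SYSTEM_PROMPT + f"Question: {query}" + f"\n\nContext: {context}"

    query = "What does the uploaded paper conclude?"  # hypothetical question
    vector_db = ChromadbOperations()
    chunks = vector_db.query(query_text=query, k=1)   # top-1 similar chunk, matching app.py
    print(formatted_prompt(query=query, context="\n".join(chunks)))

Note that model_res_generator swaps the augmented prompt back to the raw user prompt before streaming the reply, so the saved chat history never contains the injected context.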

audio_transcribe.py

Lines changed: 3 additions & 3 deletions
@@ -1,9 +1,9 @@
-import torch
-from transformers import pipeline
-import librosa
 import io
 import yaml

+import torch
+import librosa
+from transformers import pipeline

 with open('config.yaml', 'r') as f:
     config = yaml.safe_load(f)
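The body of transcribe_audio is untouched by this commit and not shown here; judging from these imports, it presumably decodes the recorded bytes with librosa and runs a transformers ASR pipeline. A minimal sketch along those lines, with the model name as a placeholder for whatever config.yaml actually specifies:

    import io

    import librosa
    import torch
    from transformers import pipeline

    def transcribe_audio(audio_bytes: bytes) -> str:
        # decode the in-memory recording to mono float samples at Whisper's 16 kHz rate
        speech, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000)
        asr = pipeline("automatic-speech-recognition",
                       model="openai/whisper-tiny",  # placeholder; the real model comes from config.yaml
                       device=0 if torch.cuda.is_available() else -1)
        return asr({"raw": speech, "sampling_rate": sr})["text"]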

chromadb_operations.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+import chromadb
+from chromadb.utils.embedding_functions import ONNXMiniLM_L6_V2
+import os
+import shutil
+
+
+class ChromadbOperations:
+    def __init__(self):
+        self.client = chromadb.PersistentClient(path=str('vectorstore'))
+        self.collection = self.client.get_or_create_collection(name="text_collection",
+                                                               embedding_function=ONNXMiniLM_L6_V2())
+
+    def insert_data(self, texts_chunks):
+        embedding_count = len(self.collection.get()['ids'])
+        if embedding_count == 0:
+            ids = [str(i) for i in range(1, len(texts_chunks)+1)]
+        else:
+            ids = [str(i) for i in range(embedding_count+1, embedding_count+1+len(texts_chunks))]
+        self.collection.add(documents=texts_chunks, ids=ids)
+
+    def count(self):
+        return self.collection.count()
+
+    def query(self, query_text, k):
+        response = self.collection.query(query_texts=[query_text], n_results=k)
+        return response['documents'][0]
+
+    def delete_vector_storage(self):
+        if len(self.client.list_collections()) != 0:
+            database_contents = os.listdir(f"{os.getcwd()}/vectorstore")
+            self.client.delete_collection(name="text_collection")
+            for name in database_contents:
+                if os.path.isdir(f"vectorstore/{name}"):
+                    shutil.rmtree(f"vectorstore/{name}")
+                else:
+                    os.remove(f"vectorstore/{name}")
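The class can be exercised on its own; a short sketch with toy strings standing in for real PDF chunks:

    from chromadb_operations import ChromadbOperations

    store = ChromadbOperations()  # opens or creates the persistent ./vectorstore database
    store.insert_data(["Ollama serves local LLMs.", "Streamlit renders the chat UI."])
    print(store.count())                                          # -> 2
    print(store.query(query_text="local language models", k=1))   # top-1 similar chunk

Because insert_data continues the sequential ids from the current collection size, repeated uploads append new chunks rather than overwrite existing ones.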

prompt_templates.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+SYSTEM_PROMPT = """Answer the following question only using the context provided, being as concise as possible.
+If you're unsure, just say that you don't know.
+
+"""

style.css

Lines changed: 11 additions & 16 deletions
@@ -1,19 +1,14 @@
-/* div.element-container.st-emotion-cache-1xanlfj.e1f1d6gn4 {
-    position: fixed;
-    bottom: 8rem;
+[data-testid="column"] {
+    box-shadow: rgb(0 0 0 / 20%) 0px 2px 1px -1px, rgb(0 0 0 / 14%) 0px 1px 1px 0px, rgb(0 0 0 / 12%) 0px 1px 3px 0px;
+    border-radius: 15px;
+    padding: 1% 1% 1% 1%;
 }

-button.st-emotion-cache-7ym5gk.ef3psqc12 {
-    position: fixed;
-    bottom: 7rem;
-    right: 300px;
-
-} */
-
-button.myButton {
-    position: fixed;
-    bottom: 7rem;
-    right: 300px;
-
+div[data-testid="stVerticalBlock"] div:has(div.fixed-header) {
+    position: sticky;
+    border-radius: 15px;
+    background: rgb(101, 105, 109);
+    top: 2.875rem;
+    z-index: 999;
+    text-align: center;
 }
-
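The commit does not show how style.css is attached to the page; a common Streamlit pattern (an assumption here, not part of this diff) is to inject it near the top of app.py, after which the div.fixed-header marker written by header.write lets the :has() rule above pin the header while the chat scrolls:

    import streamlit as st

    # hypothetical loader: read style.css and inject it into the rendered page
    with open("style.css") as css_file:
        st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)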

text_processor.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+from langchain_community.document_loaders import DirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+def get_document_chunks(path):
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n\n","\n"],
+        chunk_size=2000,
+        chunk_overlap=100,
+        length_function=len
+    )
+
+    all_chunks = []
+
+    # loading all pdf documents at once
+    pdf_loader = DirectoryLoader(path=str(path), glob="**/*.pdf")
+    pdf_documents = pdf_loader.load()
+    for single_chunk in text_splitter.split_documents(documents=pdf_documents):
+        all_chunks.append(single_chunk.page_content)
+
+    return all_chunks
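End to end, the upload handler in app.py reduces to the two calls below; a minimal sketch assuming a docs/ directory that already holds PDFs (DirectoryLoader's default file parser relies on the unstructured package):

    from chromadb_operations import ChromadbOperations
    from text_processor import get_document_chunks

    chunks = get_document_chunks(path="docs")  # split every PDF into ~2000-character chunks
    store = ChromadbOperations()
    store.insert_data(chunks)                  # embed with MiniLM and persist in ./vectorstore
    print(store.count(), "chunks indexed")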
