Update jacoby #58


Open · wants to merge 8 commits into base: main
Changes from all commits
22 changes: 22 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,22 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:0-3.11"

// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "pip3 install --user -r requirements.txt",

// Configure tool-specific properties.
// "customizations": {},

// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.vscode
.devcontainer
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
## MIT License

**Copyright (c) [2023] [multiple]**

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

**THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.**
56 changes: 28 additions & 28 deletions README.md
@@ -1,42 +1,44 @@
# pdfGPT: Improved PDF Text Chatbot using Universal Sentence Encoder and Open AI

# pdfGPT
### Problem Description:
1. When you pass a large text to Open AI, it suffers from a 4K token limit. It cannot take an entire pdf file as input.
2. Open AI sometimes becomes overly chatty and returns irrelevant responses not directly related to your query. This is because Open AI uses poor embeddings.
3. ChatGPT cannot directly talk to external data. Some solutions use LangChain, but it is token hungry if not implemented correctly.
4. There are a number of solutions like https://www.chatpdf.com, https://www.bespacific.com/chat-with-any-pdf, and https://www.filechat.io, but they have poor content quality and are prone to hallucination. One good way to avoid hallucinations and improve truthfulness is to use improved embeddings. To solve this problem, I propose improving embeddings with the Universal Sentence Encoder family of algorithms (read more here: https://tfhub.dev/google/collections/universal-sentence-encoder/1).

### Solution: What is PDF GPT?
1. PDF GPT allows you to chat with an uploaded PDF file using GPT functionalities.
2. The application intelligently breaks the document into smaller chunks and employs a powerful Deep Averaging Network Encoder to generate embeddings.
3. A semantic search is first performed on your PDF content and the most relevant embeddings are passed to Open AI.
4. A custom logic generates precise responses. The returned response can even cite the page number in square brackets ([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly. The responses are much better than the naive responses by Open AI.
5. Andrej Karpathy mentioned in this post that the KNN algorithm is most appropriate for similar problems: https://twitter.com/karpathy/status/1647025230546886658
pdfGPT is a solution that allows users to chat with an uploaded PDF file using GPT functionalities. It solves many problems associated with the use of Open AI for text analysis.

## Problem Description
1. Open AI's 4K token limit does not allow it to take an entire PDF file as input.
2. Open AI sometimes becomes overly chatty and returns irrelevant responses not related to user queries due to poor embeddings.
3. ChatGPT is unable to talk directly to external data without using token-hungry LangChain implementations.
4. Solutions like https://www.chatpdf.com, https://www.bespacific.com/chat-with-any-pdf, and https://www.filechat.io suffer from poor content quality and hallucinations. Improved embeddings can be used to avoid such issues.

To solve these problems, improved embeddings are generated with the Universal Sentence Encoder family of algorithms (Read more here: https://tfhub.dev/google/collections/universal-sentence-encoder/1).
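
A minimal sketch of loading these embeddings with `tensorflow_hub` (the same model URL that `api.py` in this PR loads; an illustration, not part of the diff):

```python
import tensorflow_hub as hub

# Deep Averaging Network variant of the Universal Sentence Encoder (v4)
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = use(["chat with any pdf"])  # one 512-dimensional vector per input sentence
```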

## Solution
1. The application intelligently breaks down documents into smaller chunks and employs a powerful Deep Averaging Network Encoder to generate embeddings.
2. A semantic search is performed on your PDF content, and the most relevant embeddings are passed to Open AI (a sketch follows this list).
3. A custom logic generates precise responses. The returned response can even cite the page number in square brackets ([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly.
4. The responses are much better than the naive responses generated by Open AI.
5. According to a tweet by Andrej Karpathy (https://twitter.com/karpathy/status/1647025230546886658), the KNN algorithm is the most appropriate for similar problems.
6. Enables production-ready APIs using **[langchain-serve](https://github.com/jina-ai/langchain-serve)**.
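
A minimal sketch of this flow, assuming the helper names from `api.py` in this PR and that the module is importable as `api` (hypothetical usage, not part of the diff):

```python
from api import SemanticSearch, pdf_to_text, text_to_chunks

texts = pdf_to_text("corpus.pdf")    # one preprocessed string per page
chunks = text_to_chunks(texts)       # ~150-word chunks tagged with page numbers
searcher = SemanticSearch()
searcher.fit(chunks, n_neighbors=5)  # USE embeddings + KNN index over the chunks
top_chunks = searcher("What problem does the paper solve?")  # most relevant chunks
```

The top chunks are then concatenated into the prompt that `generate_answer` sends to Open AI.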

### Demo
1. **Demo URL**: https://bit.ly/41ZXBJM
2. **Original Source code** (for demo hosted in Hugging Face) : https://huggingface.co/spaces/bhaskartripathi/pdfChatter/blob/main/app.py

**NOTE**: Please star this project if you like it!

Please star this project if you like it!

### Docker
Run `docker-compose -f docker-compose.yaml up` to use it with Docker Compose.


## Use `pdfGPT` in production using [langchain-serve](https://github.com/jina-ai/langchain-serve)

#### Local playground
### Local playground
1. Run `lc-serve deploy local api` in one terminal to expose the app as an API using langchain-serve.
2. Run `python app.py` in another terminal for a local Gradio playground.
3. Open `http://localhost:7860` in your browser and interact with the app.

### Cloud Deployment
Make `pdfGPT` production-ready by deploying it on [Jina Cloud](https://cloud.jina.ai/).

#### Cloud deployment

Make `pdfGPT` production ready by deploying it on [Jina Cloud](https://cloud.jina.ai/).

`lc-serve deploy jcloud api`
Run `lc-serve deploy jcloud api`

<details>
<summary>Show command output</summary>
@@ -59,7 +61,7 @@ Make `pdfGPT` production ready by deploying it on [Jina Cloud](https://cloud.jin

</details>

#### Interact using cURL
### Interact using cURL

(Change the URL to your own endpoint)
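
A hypothetical Python equivalent of the cURL call, assuming langchain-serve exposes the `@serving`-decorated `ask_url` function from `api.py` as a POST endpoint at `/ask_url` that accepts the function's keyword arguments as a JSON body:

```python
import requests

# Endpoint path, port, and payload shape are assumptions based on ask_url(url, question)
resp = requests.post(
    "http://localhost:8080/ask_url",  # replace with your own endpoint
    json={
        "url": "https://example.com/paper.pdf",
        "question": "What is the main contribution?",
    },
)
print(resp.text)
```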

@@ -149,7 +151,7 @@ sequenceDiagram
System-->>User: Return Answer
```

### Flowchart
## Flowchart
```mermaid
flowchart TB
A[Input] --> B[URL]
@@ -162,13 +164,11 @@ G -- K-Nearest Neighbour --> K[Get Nearest Neighbour - matching citation referen
K -- Generate Prompt --> H[Generate Answer]
H -- Output --> I[Output]
```

## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=bhaskatripathi/pdfGPT&type=Date)](https://star-history.com/#bhaskatripathi/pdfGPT&Date)

## Also Try:
This app creates schematic architecture diagrams, UML, flowcharts, Gantt charts and many more. You simply need to mention the use case in natural language and it will create the desired diagram.
https://github.com/bhaskatripathi/Text2Diagram

I am looking for more contributors from the open source community who can take up backlog items voluntarily and maintain the application jointly with me.

Also try [Text2Diagram](https://github.com/bhaskatripathi/Text2Diagram), an app that creates schematic architecture diagrams, UML, flowcharts, Gantt charts, and more from natural language input.
83 changes: 60 additions & 23 deletions api.py
@@ -1,10 +1,10 @@
# import libraries
import os
import re
import shutil
import urllib.request
from pathlib import Path
from tempfile import NamedTemporaryFile

import fitz
import numpy as np
import openai
@@ -13,6 +13,8 @@
from lcserve import serving
from sklearn.neighbors import NearestNeighbors

# download pdf from given url


recommender = None

@@ -21,21 +23,29 @@ def download_pdf(url, output_path):
urllib.request.urlretrieve(url, output_path)


# preprocess text


def preprocess(text):
text = text.replace('\n', ' ')
text = re.sub('\s+', ' ', text)
text = text.replace("\n", " ")
text = re.sub("\s+", " ", text)
return text


# convert pdf to text list


def pdf_to_text(path, start_page=1, end_page=None):
doc = fitz.open(path)
total_pages = doc.page_count

# if end page is not specified set it to total pages
if end_page is None:
end_page = total_pages

text_list = []

# loop through all the pages and get the text
for i in range(start_page - 1, end_page):
text = doc.load_page(i).get_text("text")
text = preprocess(text)
@@ -45,32 +55,41 @@ def pdf_to_text(path, start_page=1, end_page=None):
return text_list


# convert text list to chunks of words with page numbers


def text_to_chunks(texts, word_length=150, start_page=1):
text_toks = [t.split(' ') for t in texts]
text_toks = [t.split(" ") for t in texts]
page_nums = []
chunks = []

# loop through each word and create chunks
for idx, words in enumerate(text_toks):
for i in range(0, len(words), word_length):
chunk = words[i : i + word_length]
# if last chunk is smaller than word length and not last page then add it to next page
if (
(i + word_length) > len(words)
and (len(chunk) < word_length)
and (len(text_toks) != (idx + 1))
):
text_toks[idx + 1] = chunk + text_toks[idx + 1]
continue
chunk = ' '.join(chunk).strip()
chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
chunk = " ".join(chunk).strip()
chunk = f'[Page no. {idx + start_page}] "{chunk}"'
chunks.append(chunk)
return chunks
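
# Hypothetical illustration of the merge rule above (not part of the diff):
# a trailing chunk shorter than word_length is pushed onto the next page, so
# two short pages collapse into one chunk attributed to the later page:
#   text_to_chunks(["Alpha beta gamma.", "Delta epsilon zeta."])
#   -> ['[Page no. 2] "Alpha beta gamma. Delta epsilon zeta."']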


# semantic search class


class SemanticSearch:
def __init__(self):
self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
self.use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
self.fitted = False

# fit the data
def fit(self, data, batch=1000, n_neighbors=5):
self.data = data
self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -79,23 +98,24 @@ def fit(self, data, batch=1000, n_neighbors=5):
self.nn.fit(self.embeddings)
self.fitted = True

# call the model
def __call__(self, text, return_data=True):
inp_emb = self.use([text])
neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

if return_data:
return [self.data[i] for i in neighbors]
else:
return neighbors
return [self.data[i] for i in neighbors] if return_data else neighbors

# get text embedding
def get_text_embedding(self, texts, batch=1000):
embeddings = []
for i in range(0, len(texts), batch):
text_batch = texts[i : (i + batch)]
emb_batch = self.use(text_batch)
embeddings.append(emb_batch)
embeddings = np.vstack(embeddings)
return embeddings
return np.vstack(embeddings)
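
# Hypothetical usage sketch (not part of the diff): fit once, then call the
# instance to retrieve the nearest chunks for a query.
#   searcher = SemanticSearch()
#   searcher.fit(chunks, batch=1000, n_neighbors=5)   # embed + build KNN index
#   top_chunks = searcher("what is the conclusion?")  # returns matching chunk strings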


# load recommender


def load_recommender(path, start_page=1):
@@ -106,7 +126,10 @@ def load_recommender(path, start_page=1):
texts = pdf_to_text(path, start_page=start_page)
chunks = text_to_chunks(texts, start_page=start_page)
recommender.fit(chunks)
return 'Corpus Loaded.'
return "Corpus Loaded."


# generate text using openAI


def generate_text(openAI_key, prompt, engine="text-davinci-003"):
@@ -119,16 +142,17 @@
stop=None,
temperature=0.7,
)
message = completions.choices[0].text
return message
return completions.choices[0].text


# generate answer for a given question


def generate_answer(question, openAI_key):
topn_chunks = recommender(question)
prompt = ""
prompt += 'search results:\n\n'
prompt = "" + "search results:\n\n"
for c in topn_chunks:
prompt += c + '\n\n'
prompt += c + "\n\n"

prompt += (
"Instructions: Compose a comprehensive reply to the query using the search results given. "
@@ -142,8 +166,15 @@ def generate_answer(question, openAI_key):
)

prompt += f"Query: {question}\nAnswer:"
answer = generate_text(openAI_key, prompt, "text-davinci-003")
return answer
return generate_text(openAI_key, prompt, "text-davinci-003")



# global instance of semantic search
recommender = SemanticSearch()

# load openAI key



def load_openai_key() -> str:
@@ -155,14 +186,20 @@ def load_openai_key() -> str:
return key


# ask url


@serving
def ask_url(url: str, question: str):
download_pdf(url, 'corpus.pdf')
load_recommender('corpus.pdf')
download_pdf(url, "corpus.pdf")
load_recommender("corpus.pdf")
openAI_key = load_openai_key()
return generate_answer(question, openAI_key)


# ask file


@serving
async def ask_file(file: UploadFile, question: str) -> str:
suffix = Path(file.filename).suffix