feat: store full pages

rti · rti · commit 08fc1b1b1027 · 2024-04-19T06:15:07.000Z
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 .venv
 .direnv
 __pycache__
+*.xml
diff --git a/Dockerfile b/Dockerfile
@@ -4,5 +4,7 @@ WORKDIR /workspace
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 
-ENTRYPOINT [ "python" ]
-CMD [ "main.py" ]
+# ENTRYPOINT [ "python" ]
+# CMD [ "main.py" ]
+
+CMD ["sleep", "infinity"]
diff --git a/chunker.py b/chunker.py
@@ -0,0 +1,67 @@
+def chunk(s, chunkSize=256, overlap=64):
+    """Split text into overlapping chunks of space-separated words.
+
+    Args:
+        s: Text to split; words are delimited by single spaces.
+        chunkSize: Maximum number of words per chunk.
+        overlap: Number of trailing words of a chunk that are repeated at
+            the start of the next chunk.
+
+    Returns:
+        The list of chunks in order. An empty string yields [""].
+
+    Raises:
+        ValueError: If chunkSize is not positive or overlap is not in
+            the range 0 <= overlap < chunkSize.
+    """
+    if chunkSize <= 0 or not 0 <= overlap < chunkSize:
+        raise ValueError("chunkSize must be positive and 0 <= overlap < chunkSize")
+
+    words = s.split(" ")
+    step = chunkSize - overlap
+    chunks = []
+    for start in range(0, len(words), step):
+        chunks.append(" ".join(words[start:start + chunkSize]))
+        # Stop once a chunk reaches the last word: a further chunk would
+        # only repeat words that this one already contains.
+        if start + chunkSize >= len(words):
+            break
+    return chunks
+
+
+if __name__ == "__main__":
+    # (text, expected chunks) pairs, checked with chunkSize=10, overlap=2.
+    cases = [
+        (
+            "",
+            [""],
+        ),
+        (
+            "hi",
+            ["hi"],
+        ),
+        (
+            "this is a test",
+            ["this is a test"],
+        ),
+        (
+            "this is a long test with more than ten words so that we can test overlap",
+            [
+                "this is a long test with more than ten words",
+                "ten words so that we can test overlap",
+            ],
+        ),
+        (
+            # Exactly chunkSize words: no trailing overlap-only chunk.
+            "one two three four five six seven eight nine ten",
+            ["one two three four five six seven eight nine ten"],
+        ),
+    ]
+
+    print("Testing chunk function.")
+    for text, expected in cases:
+        actual = chunk(text, 10, 2)
+        print("\nInput: %s \nExpected: %s\nActual:   %s" % (str(text), str(expected), str(actual)))
+        assert actual == expected, "%s != %s" % (actual, expected)
+
+    print('\nAll tests passed.')
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -12,12 +12,12 @@ services:
       - ollama-models:/var/lib/ollama/models
       - ollama-config:/root/.ollama # stores private key
 
-    # GPU access for ROCm (AMD GPUs)
+    # GPU access for ROCm (AMD)
     devices:
       - /dev/dri:/dev/dri
-      - /dev/kfd:/dev/kfd
+      # - /dev/kfd:/dev/kfd
 
-    # GPU access for CUDA (NVIDIA)
+    # GPU access for CUDA (NVIDIA) - untested
     # deploy:
     #   resources:
     #     reservations:
diff --git a/import_dump.py b/import_dump.py
@@ -3,110 +3,105 @@
 import tqdm
 import re
 
-import ml
+import models
 import postgres
+import chunker
+
 
 def strip_wikitext(s):
-    HTML_FILTERS  = {
-        'div': ['navbox','navbox-styles','spoken-wikipedia', 'noprint', 'hatnote', 'rt-tooltip', 'reflist'],
-        'span': ['mw-ext-cite-error'],
-        'table': ['noprint','ombox'],
-        'ol': ['breadcrumb-nav-container', 'references'],
-        'sup': ['reference']
-    }
-    REGEX_FILTERS = {
-        'p': '→.*ersion'
+    """Return the visible text of rendered page markup, or None.
+
+    Parses ``s`` with BeautifulSoup (lxml parser), removes every <figure>
+    plus the elements selected by HTML_FILTERS (tag/class pairs) and
+    REGEX_FILTERS (tag/pattern pairs), and returns the stripped text.
+
+    Returns None when ``s`` is None, when no text is left after stripping,
+    or when the text starts with "#redirect" (case-insensitive).
+    """
+    HTML_FILTERS = {
+        "div": [
+            "navbox",
+            "navbox-styles",
+            "spoken-wikipedia",
+            "noprint",
+            "hatnote",
+            "rt-tooltip",
+            "reflist",
+        ],
+        "span": ["mw-ext-cite-error"],
+        "table": ["noprint", "ombox"],
+        "ol": ["breadcrumb-nav-container", "references"],
+        "sup": ["reference"],
     }
+    REGEX_FILTERS = {"p": "→.*ersion"}
 
     def filterHtml(soup):
-        for figure in soup.find_all('figure'):
+        for figure in soup.find_all("figure"):
             figure.decompose()
 
         for tag, classes in HTML_FILTERS.items():
             for className in classes:
-                for div in soup.find_all(tag, {'class': className}):
+                for div in soup.find_all(tag, {"class": className}):
                     div.decompose()
 
         for tag, regex in REGEX_FILTERS.items():
             for element in soup.find_all(tag):
-                if(re.search(regex, str(element)) != None):
+                if re.search(regex, str(element)) is not None:
                     element.decompose()
 
         return soup
 
-    if s is None: return None
+    if s is None:
+        return None
 
-    soup = bs4.BeautifulSoup(s, 'lxml')
+    soup = bs4.BeautifulSoup(s, "lxml")
     text = filterHtml(soup).get_text()
     text = text.strip()
 
-    if len(text) == 0: return None
-    if text.lower().startswith("#redirect"): return None
+    if not text:
+        return None
+    if text.lower().startswith("#redirect"):
+        return None
 
     return text
 
-def chunk(s):
-    words = s.split(" ")
-    CHUNK_SIZE = 256
-    OVERLAP = 64
-    return [" ".join(words[i:i+CHUNK_SIZE]) for i in range(0, len(words), CHUNK_SIZE - OVERLAP)]
-
-# cases = [
-#     (
-#         "",
-#         [""],
-#     ),
-#     (
-#         "hi",
-#         ["hi"],
-#     ),
-#     (
-#         "this is a test",
-#         ["this is a test"],
-#     ),
-#     (
-#         "this is a long test with more than ten words so that we can test overlap",
-#         [
-#             "this is a long test with more than ten words",
-#             "ten words so that we can test overlap",
-#         ],
-#     ),
-# ]
-#
-#
-# for case in cases:
-#     print("Testing chunk function.")
-#     print("Input: %s" % str(case))
-#     outexp = case[1]
-#     outactual = chunk(case[0])
-#     assert outactual == outexp, "%s != %s" % (outactual, outexp)
-#
-postgres.init(embeddingLength=ml.embeddingLength())
+
+postgres.init(embeddingLength=models.embeddingLength())
 
 with postgres.get_connection().cursor() as cur:
     with open("dump.xml", "rb") as f:
         dump = mwxml.Dump.from_file(f)
 
         for page in tqdm.tqdm(dump.pages):
             title = page.title
-            if title is None: continue
+            if title is None:
+                continue
             if re.search("/[a-z][a-z][a-z]?(-[a-z]+)?$", title):
                 # print(f"skipping {title}")
                 continue
 
             # Delete existing page chunks, that is, update if we know about it already
-            cur.execute("DELETE FROM page_text WHERE title = %s;", (title,))
+            cur.execute("DELETE FROM chunks WHERE title = %s;", (title,))
+            cur.execute("DELETE FROM pages WHERE title = %s;", (title,))
 
             # We support only one revision in the dump
             text = list(page)[0].text
 
             text = strip_wikitext(text)
-            if text is None: continue
+            if text is None:
+                continue
 
-            for c in chunk(text):
-                embedding = ml.embeddingString(c)
-                cur.execute("INSERT INTO page_text (title, text, embedding) VALUES (%s, %s, %s);",
-                            (title, c, embedding))
+            cur.execute(
+                "INSERT INTO pages (title, text) VALUES (%s, %s);",
+                (title, text),
+            )
+
+            for c in chunker.chunk(text):
+                embedding = models.embeddingString(c)
+                cur.execute(
+                    "INSERT INTO chunks (title, text, embedding) VALUES (%s, %s, %s);",
+                    (title, c, embedding),
+                )
 
             # Commit the transaction
             postgres.get_connection().commit()
diff --git a/main.py b/main.py
@@ -1,10 +1,8 @@
-import ml
+import models
 import postgres
 
-# print(ml.chat("What is the meaning of life?"))
-
 query = "event 2024 tallinn"
-emb = ml.embeddingString(query)
+emb = models.embeddingString(query)
 cur = postgres.get_connection().cursor()
-cur.execute("SELECT text FROM page_text ORDER BY embedding <-> %s LIMIT 5;", (emb,))
+cur.execute("SELECT text FROM chunks ORDER BY embedding <-> %s LIMIT 5;", (emb,))  # "page_text" was replaced by "chunks"
 res = cur.fetchall()
@@ -20,7 +18,7 @@
 print("Prompt: " + prompt + "\n\n\n")
 
 print("********************************************************************************")
-result = ml.chat(prompt)
+result = models.chat(prompt)
 print(result)
 
 print("********************************************************************************")
@@ -37,6 +35,6 @@
 """
 print("Prompt 2: " + prompt2)
 print("********************************************************************************")
-result2 = ml.chat(prompt2)
+result2 = models.chat(prompt2)
 print(result2)
 
diff --git a/models.py b/models.py
@@ -4,8 +4,8 @@
 
 embeddingsModel = "all-minilm"  # or: mxbai-embed-large
 # chatModel = "qwen:0.5b"
-chatModel = "gemma:7b"
-# chatModel = "mistral:v0.2"
+# chatModel = "gemma:7b"
+chatModel = "mistral:v0.2"
 
 
 def get_connection():
diff --git a/postgres.py b/postgres.py
@@ -1,4 +1,5 @@
 import psycopg
+from psycopg.sql import SQL, Literal
 
 _db: psycopg.Connection | None = None
 
@@ -13,6 +14,9 @@ def get_connection() -> psycopg.Connection:
 
 
 def init(embeddingLength: int):
+    if not isinstance(embeddingLength, int) or embeddingLength <= 0:
+        raise ValueError("Invalid embedding length")
+
     db = get_connection()
     cur = db.cursor()
 
@@ -23,14 +27,21 @@ def init(embeddingLength: int):
     db.commit()
 
     cur.execute(
-        """
-        CREATE TABLE IF NOT EXISTS page_text ( 
-            id SERIAL PRIMARY KEY, 
-            title VARCHAR(255) NOT NULL, 
-            text TEXT NOT NULL, 
-            embedding vector( %s ) NOT NULL
-        );
-        """,
-        (embeddingLength,),
+        SQL(
+            """
+            CREATE TABLE IF NOT EXISTS pages ( 
+                id SERIAL PRIMARY KEY, 
+                title VARCHAR(255) NOT NULL, 
+                text TEXT NOT NULL
+            );
+
+            CREATE TABLE IF NOT EXISTS chunks ( 
+                id SERIAL PRIMARY KEY, 
+                title VARCHAR(255) NOT NULL, 
+                text TEXT NOT NULL, 
+                embedding vector( {} ) NOT NULL
+            );
+            """
+        ).format(Literal(str(embeddingLength)))
     )
     db.commit()

-Original file line number
+Diff line change
@@ @@ -1,3 +1,4 @@ @@
 .venv
 .direnv
 __pycache__
 +*.xml