-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpopulate.py
102 lines (90 loc) · 3.44 KB
/
populate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from pinecone import Pinecone
from openai import OpenAI
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from atlas import updateAtlas
import os
if not os.path.exists("keys.json"):
print("Please create and populate keys.json")
quit()
with open('keys.json', 'r') as f:
keys = json.load(f)
if not os.path.exists("vectorsWithMetadata.json"):
with open("vectorsWithMetadata.json", "w") as f:
print("Initializing vectorsWithMetadata.json")
f.write("{}")
with open('vectorsWithMetadata.json') as f:
data = json.load(f)
MODEL = "text-embedding-3-large"
client = OpenAI(api_key=keys["openai"])
pc = Pinecone(api_key=keys["pinecone"])
index = pc.Index("openai")
URL = "https://xkcd.com/info.0.json"
r = requests.get(URL)
newestID = r.json()["num"]
vectorIDToAdd = index.describe_index_stats().total_vector_count + 1
changedSomething = False
repeatTries = 0
while (vectorIDToAdd <= newestID):
if vectorIDToAdd == 404:
vectorIDToAdd += 1
continue
res = requests.get(f"https://xkcd.com/{vectorIDToAdd}/info.0.json")
JSON = json.loads(res.text)
d1 = datetime(int(JSON["year"]), int(JSON["month"]), int(JSON["day"]))
d2 = datetime.today()
daysPassed = abs((d2 - d1).days)
if (daysPassed < 2):
dayText = "day" if daysPassed == 1 else "days"
print(f"comic no. {vectorIDToAdd} is only {daysPassed} {dayText} old, minimum 2")
break
try:
URL = f"https://www.explainxkcd.com/wiki/index.php/{vectorIDToAdd}"
r = requests.get(URL)
soup = BeautifulSoup(r.text, 'html.parser')
title = "Title: " + JSON["title"]
titleText = "Title text: " + JSON["alt"]
text = title
transcript = soup.find(id="Transcript")
if (transcript):
currentSibling = transcript.parent.findNext("dl")
text += "\n" + "Transcript: "
if (currentSibling):
text += currentSibling.get_text()
while (currentSibling.name == "dl"):
text += "\n" + currentSibling.get_text()
currentSibling = currentSibling.next_sibling
text += "\n" + titleText
text += "\nExplanation: "
explanation = soup.select_one("#Explanation, #Explanations, #Brief_Explanation, #Eggsplanation")
currentSibling = explanation.parent.findNext("p")
while (currentSibling and str(currentSibling.name).find("h") == -1):
text += "\n" + currentSibling.get_text()
currentSibling = currentSibling.next_sibling
text = text[:1000]
print(text)
repeatTries = 0
except AttributeError:
if (repeatTries < 3):
repeatTries += 1
print("Failed to parse page, retrying...")
continue
else:
raise Exception("Failed to parse page")
res = client.embeddings.create(
input=text, model=MODEL
)
embed = res.data[0].embedding
index.upsert(vectors=[
{"id":str(vectorIDToAdd), "values":embed}
])
changedSomething = True
data[str(vectorIDToAdd)] = {"id":str(vectorIDToAdd), "values":embed, "title":JSON["title"], "date":(JSON["day"] + "/" + JSON["month"] + "/" + JSON["year"])}
with open('vectorsWithMetadata.json', 'w') as file:
file.write(json.dumps(data))
vectorIDToAdd += 1
print(f"most recently cached is no. {index.describe_index_stats().total_vector_count}")
if (changedSomething):
updateAtlas()