# core.py — RAG chains, statement-of-faith screening, and Whisper transcription.
import os
import json

# Runtime configuration: prefer a local config.json; fall back to the
# environment when the file is absent.
_CONFIG_KEYS = ('OPENAI_API_KEY', 'DATA_SOURCE_DIR', 'DB_PATH', 'LOG_DIR')
config = {}
if os.path.exists('config.json'):
    # Explicit encoding avoids platform-dependent default decoding.
    with open('config.json', 'r', encoding='utf-8') as conf:
        config = json.load(conf)
else:
    for _key in _CONFIG_KEYS:
        config[_key] = os.getenv(_key)
# Const values (indexing fails fast with KeyError if config.json lacks a key).
OPENAI_KEY = config['OPENAI_API_KEY']
DB_PATH = config["DB_PATH"]
DATA_SOURCE_DIR = config["DATA_SOURCE_DIR"]
LOG_DIR = config["LOG_DIR"]
#### INDEXING ####
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from database import create_db
# Embedding model used both for building and for querying the vector store.
embedding = OpenAIEmbeddings(api_key=OPENAI_KEY)
# Reuse a previously persisted Chroma database when available; otherwise
# build one from the documents under DATA_SOURCE_DIR (see database.create_db).
if os.path.exists(DB_PATH):
    print(f"Database found at {DB_PATH}")
    vectorstore = Chroma(persist_directory=DB_PATH, embedding_function=embedding)
else:
    print(f"Constructing database at {DB_PATH}")
    vectorstore = create_db(DB_PATH, DATA_SOURCE_DIR, embedding)
# Maximal-marginal-relevance retrieval: top 3 documents, balancing relevance
# against diversity equally (lambda_mult=0.5).
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={'k': 3, 'lambda_mult': 0.5 })
from langchain_openai import OpenAI, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Deterministic (temperature=0) chat model shared by the RAG chains below.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, api_key=OPENAI_KEY)
# Post-processing
def format_docs(docs):
    """Concatenate the page contents of *docs*, separated by blank lines."""
    contents = [d.page_content for d in docs]
    return "\n\n".join(contents)
# -------------- EXECUTION ------------
def send_prompt_rag_plain(question: str, system_prompt: str):
    """Answer *question* with retrieval-augmented generation.

    Returns a dict with 'response' (the model's answer) and 'context'
    (the retrieved documents joined by blank lines).
    """
    template = system_prompt + """\nAnswer the question and base on the context if relevant.
Context:
{context}
Question: {question}
"""
    prompt = ChatPromptTemplate.from_template(template)
    # Retrieve once and reuse. The original chain queried the retriever a
    # second time for the returned context, doubling latency and risking a
    # mismatch between the context the model saw and the one reported.
    docs = retriever.get_relevant_documents(question)
    context = format_docs(docs)
    rag_chain = prompt | llm | StrOutputParser()
    return {
        'response': rag_chain.invoke({"context": context, "question": question}),
        'context': context
    }
def send_prompt_llm(prompt: str):
    """Send *prompt* straight to the chat model (no retrieval) and return its text."""
    chat = ChatOpenAI(model="gpt-4o-mini", temperature=0, api_key=OPENAI_KEY)
    conversation = [
        ("system", "Answer the user prompt."),
        ("user", prompt),
    ]
    reply = chat.invoke(conversation)
    return reply.content
def get_follow_up_questions(question: str, answer: str):
    """Ask the model for up to three follow-up questions (array-of-strings reply)."""
    agent = ChatOpenAI(model="gpt-4o-mini", temperature=0.3, api_key=OPENAI_KEY)
    conversation = [
        ("system", "Suggest three most relevant follow-up questions that the user might ask after the conversation below, with the output format being an array of strings. If the question is irrelevant to the Biblical context, respond with an empty array."),
        ("user", f"Question: {question}\nAnswer: {answer}"),
    ]
    return agent.invoke(conversation).content
def extract_keywords(prompt: str):
    """Ask the LLM to extract keywords from *prompt*.

    Returns (keywords, language): a list of stripped keyword strings and the
    language code the model detected. Raises json.JSONDecodeError when the
    model's reply is not valid JSON even after fence stripping.
    """
    messages = [
        ("system", """
Extract the keywords from the user prompt.
The response should be a json object with two fields - a list of keywords and a language code of the prompt as `language_code`.
Prompt:
"""),
        ("user", prompt)
    ]
    response = llm.invoke(messages).content
    # Chat models frequently wrap JSON in ```json fences; strip them so
    # json.loads does not fail on otherwise-valid output.
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:]
    response_obj = json.loads(cleaned)
    raw_keywords = response_obj["keywords"]
    language = str(response_obj["language_code"])
    keywords = [str(s).strip() for s in raw_keywords]
    return keywords, language
# =========== EXPERIMENTAL =========
def prepend_docs(docs: list):
    """Join *docs* into one string, prefixed by two fixed off-topic passages.

    Prepending unrelated documents was found to improve performance.
    """
    random_docs = [
        "Video game monetization is a type of process that a video game publisher can use to generate revenue from a video game product. The methods of monetization may vary between games, especially when they come from different genres or platforms, but they all serve the same purpose to return money to the game developers, copyright owners, and other stakeholders. As the monetization methods continue to diversify, they also affect the game design in a way that sometimes leads to criticism.",
        "Capitalism is an economic system based on the private ownership of the means of production and their operation for profit. Central characteristics of capitalism include capital accumulation, competitive markets, price systems, private property, recognition of property rights, self-interest, economic freedom, meritocracy, work ethic, consumer sovereignty, profit motive, entrepreneurship, commodification, voluntary exchange, wage labor, production of commodities and services, and focus on economic growth.[6][7][8][9][10][11] In a market economy, decision-making and investments are determined by owners of wealth, property, or ability to maneuver capital or production ability in capital and financial markets—whereas prices and the distribution of goods and services are mainly determined by competition in goods and services markets.[12]",
    ]
    parts = list(random_docs)
    for doc in docs:
        parts.append(doc.page_content)
    return "\n\n".join(parts)
def send_prompt_experimental(question: str, system_prompt: str):
    """RAG answer with padded context and a statement-of-faith screen.

    The model sees the retrieved documents with two off-topic passages
    prepended (see prepend_docs); the caller receives only the genuinely
    retrieved documents as 'context'. If the screened answer conflicts with
    the statement of faith, the suggested replacement is returned and the
    context is blanked.
    """
    template = system_prompt + """Use the context if relevant to help formatting your answer. If the question is irrelevant to Biblical context and Bible translation, you can refuse to answer.
Context:
{context}
Question: {question}
"""
    prompt = ChatPromptTemplate.from_template(template)
    # Retrieve once; the original ran the retriever twice (inside the chain
    # and again for the returned context), doubling latency.
    docs = retriever.get_relevant_documents(question)
    rag_chain = prompt | llm | StrOutputParser()
    answer = rag_chain.invoke({"context": prepend_docs(docs), "question": question})
    context = format_docs(docs)
    # Renamed from `eval`, which shadowed the builtin.
    verdict = eval_statement_of_faith(question, answer)
    if verdict["hasConflict"]:
        answer = verdict["suggested-answer"]
        context = ""
    return {
        'response': answer,
        'context': context
    }
def send_rag_chat(user_prompt: str, last_response: str):
    """Answer *user_prompt* in a chat setting.

    Follow-up questions about *last_response* reuse that response as the
    context instead of hitting the retriever. The answer is screened against
    the statement of faith and replaced when it conflicts.
    """
    template = """
You are an evangelical Christian with traditional beliefs about God and the Bible. However, do not preface your responses with your persona.
Use the context if relevant to help formatting your answer. If the question is irrelevant to Biblical context and Bible translation, simply refuse to answer.
Context:
{context}
Question: {question}
"""
    rag_prompt = ChatPromptTemplate.from_template(template)
    # Define context based on whether the question is about the last response.
    if is_chat_history_question(user_prompt, last_response):
        context = lambda _: last_response
    else:
        context = retriever | prepend_docs
    # Chain
    rag_chain = (
        {"context": context, "question": RunnablePassthrough() }
        | rag_prompt
        | llm
        | StrOutputParser()
    )
    answer = rag_chain.invoke(user_prompt)
    # Renamed from `eval`, which shadowed the builtin.
    verdict = eval_statement_of_faith(user_prompt, answer)
    if verdict["hasConflict"]:
        answer = verdict["suggested-answer"]
    return answer
def eval_statement_of_faith(question: str, answer: str):
    """Check *answer* against the Statement of Faith.

    Returns the model's verdict parsed as a dict with keys "hasConflict" and
    "suggested-answer". Raises json.JSONDecodeError when the reply is not
    valid JSON even after fence stripping.
    """
    system_prompt = """
You believe in the following Statement of Faith and you are an Evangelical Christian.
1. We consider Essential beliefs to be those that define us as believers in Jesus Christ.
2. The Bible is divinely inspired by God and has final authority.
3. God is one and exists in three persons: God the Father, God the Son and God the Holy Spirit.
4. Because of the fall of man, all humans are sinful, and in need of salvation.
5. The death of Christ is a substitute for sinners and provides for the cleansing of those who believe.
6. By God's grace, through faith, man receives salvation as a free gift because of Jesus' death and resurrection.
7. The resurrection of all—the saved to eternal life and the lost to eternal punishment.
Evaluate the question and answer below. If the answer disagrees with the statement of faith, respond true and refine the answer to align with our statement of faith.
Otherwise, respond false. The response should be a json object with two fields: "hasConflict" and "suggested-answer"
"""
    query = f"Question: {question}\nAnswer: {answer}"
    messages = [
        ("system", system_prompt),
        ("user", query),
    ]
    response = llm.invoke(messages).content
    # Chat models frequently wrap JSON in ```json fences; strip them so the
    # parse does not fail on otherwise-valid output.
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:]
    return json.loads(cleaned)
def summarize(content: str) -> str:
    """Condense *content* into bullet points via a low-temperature chat model."""
    agent = ChatOpenAI(temperature=0.3, api_key=OPENAI_KEY)
    instruction = f"Capture the most important points from the following paragraph and concisely format your response as bullet points: \n{content}"
    return agent.invoke(instruction).content
def is_chat_history_question(question: str, prev_response: str):
    """Return True when *question* is a follow-up about *prev_response*.

    Delegates the judgement to a temperature-0 model instructed to reply with
    the literal word "True" or "False".
    """
    detect_agent = ChatOpenAI(temperature=0, api_key=OPENAI_KEY)
    p = f'Evaluate the question below, response "True" if it is a question about the previous response. Otherwise, response "False" if it is a new question: \n```Previous response:\n {prev_response}```\n```Question: {question}```'
    res = detect_agent.invoke(p).content
    # Tolerate surrounding whitespace; the original exact comparison failed
    # on replies such as "True\n" and silently treated them as False.
    return res.strip() == "True"
### TRANSCRIPTION
from openai import OpenAI
# Raw OpenAI client (separate from the langchain wrappers above), used for
# Whisper audio transcription.
client = OpenAI(api_key=OPENAI_KEY)
def transcribe(file: str):
    """Transcribe the audio file at path *file* with Whisper; return the text."""
    with open(file, "rb") as fh:
        result = client.audio.transcriptions.create(
            model="whisper-1",
            file=fh
        )
    return result.text