ingestion_tools.py
# TODO: abstract all of this into a function that takes in a PDF file name
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex
from llama_index.core.node_parser import SentenceSplitter, JSONNodeParser
from llama_index.core.indices.struct_store import JSONQueryEngine
from llama_index.core.tools import FunctionTool, QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters, FilterCondition
from typing import List, Optional, Tuple
import re


def enforce_fnc_name(tool_name: str) -> str:
    """Strip characters that are not allowed in OpenAI tool names."""
    # OpenAI enforces a specific character set for tool names,
    # so remove any character outside of letters, digits, "_" and "-".
    pattern = r'[^a-zA-Z0-9_-]'
    return re.sub(pattern, '', tool_name)
def get_doc_tools(
    file_path: str,
    name: str,
) -> Tuple[FunctionTool, QueryEngineTool]:
    """Get vector query and summary query tools from a document."""
    # load and chunk the document, then build a vector index over the chunks
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    splitter = SentenceSplitter(chunk_size=1024)
    nodes = splitter.get_nodes_from_documents(documents)
    vector_index = VectorStoreIndex(nodes)

    def vector_query(
        query: str,
        page_numbers: Optional[List[str]] = None
    ) -> str:
        """Use to answer questions over a given paper.

        Useful if you have specific questions over the paper.
        Always leave page_numbers as None UNLESS there is a specific page you want to search for.

        Args:
            query (str): the string query to be embedded.
            page_numbers (Optional[List[str]]): Filter by a set of pages. Leave as None
                to perform a vector search over all pages. Otherwise, filter by the
                set of specified pages.
        """
        page_numbers = page_numbers or []
        metadata_dicts = [
            {"key": "page_label", "value": p} for p in page_numbers
        ]
        query_engine = vector_index.as_query_engine(
            similarity_top_k=2,
            filters=MetadataFilters.from_dicts(
                metadata_dicts,
                condition=FilterCondition.OR
            )
        )
        response = query_engine.query(query)
        return str(response)

    vector_query_tool = FunctionTool.from_defaults(
        name=f"vector_tool_{name}",
        fn=vector_query
    )

    summary_index = SummaryIndex(nodes)
    summary_query_engine = summary_index.as_query_engine(
        response_mode="tree_summarize",
        use_async=True,
    )
    summary_tool = QueryEngineTool.from_defaults(
        name=f"summary_tool_{name}",
        query_engine=summary_query_engine,
        description=(
            f"Useful for summarization questions related to {name}"
        ),
    )

    return vector_query_tool, summary_tool
def get_json_tools(
    json_fnc_schema, json_changes_schema,
    json_fnc_values, json_changes_values,
    input_file):
    """Build JSON query tools over the function details and the line-change data of a patch."""
    # # load the document and text information
    # documents = JSONReader(
    #     levels_back=0
    # ).load_data(input_file=input_file)
    # print(documents)
    # splitter = SentenceSplitter(chunk_size=1024)
    # nodes = splitter.get_nodes_from_documents(documents)
    # # index component
    # vector_index = VectorStoreIndex(nodes)
    # summary_index = SummaryIndex(nodes)
    # # create the query engines
    # summary_query_engine = summary_index.as_query_engine(
    #     response_mode="tree_summarize",
    #     use_async=True,
    # )
    # vector_query_engine = vector_index.as_query_engine()

    # sanitize the function name so the generated tool names are OpenAI-compliant
    function_name = enforce_fnc_name(json_fnc_values["func_name"])

    json_fnc_query_engine = JSONQueryEngine(
        json_value=json_fnc_values,
        json_schema=json_fnc_schema,
        function_name=f"{function_name}_details_fnc",
    )
    json_changes_query_engine = JSONQueryEngine(
        json_value=json_changes_values,
        json_schema=json_changes_schema,
        function_name=f"{function_name}_changes_fnc",
    )

    # # transform the engines into tools
    # summary_tool = QueryEngineTool.from_defaults(
    #     query_engine=summary_query_engine,
    #     description=(
    #         "Useful for summarization questions about the vulnerability patch"
    #     ),
    # )
    # vector_tool = QueryEngineTool.from_defaults(
    #     query_engine=vector_query_engine,
    #     description=(
    #         "Useful for retrieving specific context about the vulnerability patch."
    #     ),
    # )

    json_fnc_tool = QueryEngineTool.from_defaults(
        query_engine=json_fnc_query_engine,
        name=f"{function_name}_details_tool",
        description=(
            f"Useful for retrieving specific information about the function "
            f"before and after the vulnerability patch for function {function_name}"
        )
    )
    json_changes_tool = QueryEngineTool.from_defaults(
        query_engine=json_changes_query_engine,
        name=f"{function_name}_changes_tool",
        description=(
            f"Useful for retrieving specific information about the line changes "
            f"of the vulnerability patch for function {function_name}"
        )
    )

    return json_fnc_tool, json_changes_tool
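

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original pipeline).
# The PDF path and paper name below are placeholders, and the snippet assumes
# an embedding/LLM backend is already configured for llama_index (e.g. an
# OpenAI API key in the environment); adjust both before running.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Build the document tools for a hypothetical paper and query them directly.
    vector_tool, summary_tool = get_doc_tools(
        file_path="papers/example_paper.pdf",  # placeholder path
        name="example_paper",
    )
    # FunctionTool/QueryEngineTool expose `.call()`, so the tools can be
    # exercised standalone before handing them to an agent.
    print(vector_tool.call("What problem does the paper address?"))
    print(summary_tool.call("Give a one-paragraph summary of the paper."))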