-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfind-similar-papers.py
270 lines (199 loc) · 8.65 KB
/
find-similar-papers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
import PyPDF2
import io
import json
import sys
import os
# Find all of the JOSS paper PDFs under the current directory
def find_papers():
    """Return paths to every JOSS paper PDF below the current directory.

    Recursively walks "." and collects each file whose name ends in
    ".pdf" and contains "joss" (case-insensitive), e.g.
    ./joss.00899/10.21105.joss.00899.pdf.
    """
    papers = []
    # os is imported at module level; no need to re-import here
    for root, _dirs, files in os.walk("."):
        for file in files:
            # JOSS PDFs are named like 10.21105.joss.00899.pdf
            if file.endswith(".pdf") and "joss" in file.lower():
                papers.append(os.path.join(root, file))
    return papers
# Import the os library (already imported above; kept for safety)
import os
# OpenAI client — NOTE(review): `openai.embeddings_utils` and the
# `ai.Model` / `ai.Embedding` surface below exist only in openai < 1.0;
# confirm the pinned version before upgrading the dependency.
import openai as ai
# Embedding helpers: cosine_similarity is used in __main__ below;
# get_embedding is imported but not used in this file
from openai.embeddings_utils import get_embedding, cosine_similarity
# Pandas, used to load and score the stored embeddings
import pandas as pd
# Authenticate against OpenAI; raises KeyError if the variable is unset
ai.api_key = os.environ['OPENAI_API_KEY']
# List the available models (result is stored but never printed or used)
models = ai.Model.list()
# Create an embedding for the paper
def create_embedding(paper):
    """Return the OpenAI embedding API response for a paper PDF.

    Reads the PDF at path *paper*, extracts the text of every page,
    truncates it, and requests an embedding from OpenAI. Returns None
    when the PDF cannot be parsed.
    """
    try:
        # Read the PDF bytes with a context manager so the handle is
        # closed promptly (the original leaked the file object)
        with open(paper, "rb") as fh:
            pdf = fh.read()
        reader = PyPDF2.PdfReader(io.BytesIO(pdf))
        # Concatenate the extracted text of every page
        text = "".join(page.extract_text() for page in reader.pages)
        # Truncate the prompt. NOTE(review): this limits *characters*,
        # not tokens — a conservative bound well under the model's
        # token limit, but it discards most of a long paper.
        text = text[:8191]
        # Request the embedding from the text-embedding-3-large model
        embedding = ai.Embedding.create(engine = "text-embedding-3-large", input=text)
        return embedding
    except PyPDF2.errors.PdfReadError as e:
        print(f"Error reading PDF file {paper}: {e}")
        return None
# This function converts 10.21105.joss.00899.pdf to
# https://joss.theoj.org/papers/10.21105/joss.00899.json
# (the original comment omitted the /papers/ path segment)
def convert_pdf_string(incoming):
    """Convert a JOSS PDF filename to the matching JOSS API JSON URL."""
    # Filename splits into ["10", "21105", "joss", "00899", "pdf"]
    parts = incoming.split(".")
    # DOI prefix (10.21105) becomes the directory, joss.NNNNN the resource
    return f"https://joss.theoj.org/papers/{parts[0]}.{parts[1]}/{parts[2]}.{parts[3]}.json"
# Call the JOSS API and parse the JSON response
def call_joss_api(paper):
    """Fetch the JOSS metadata JSON for *paper*; None on any HTTP failure."""
    import requests
    # Translate the PDF filename into the API endpoint URL
    api_url = convert_pdf_string(paper)
    result = requests.get(api_url)
    # Anything other than 200 OK means the paper is unknown to JOSS
    return result.json() if result.status_code == 200 else None
# Extract reviewer array from the JSON response and join fields as string
def extract_reviewers(response):
    """Return the reviewers as a comma-separated string, or None."""
    if response is None:
        return None
    # Join the reviewer handles into a single display string
    return ", ".join(reviewer for reviewer in response['reviewers'])
# Extract the review url from the JSON response
def extract_review_url(response):
    """Return the review-issue URL from a JOSS API response, or None."""
    return None if response is None else response['paper_review']
# Extract the paper title from the JSON response
def extract_title(response):
    """Return the paper title from a JOSS API response, or None."""
    if response is None:
        return None
    return response['title']
def extract_author(response):
    """Return the submitting author from a JOSS API response, or None."""
    # Guard against a failed API call upstream
    return None if response is None else response['submitting_author']
def extract_editor(response):
    """Return the handling editor from a JOSS API response, or None."""
    if response is None:
        return None
    # The API reports the editor's handle under the 'editor' key
    return response['editor']
def extract_editor_status(response):
    """Return the editor's status from a JOSS API response, or None."""
    # Guard against a failed API call upstream
    return None if response is None else response['editor_status']
# Takes a path like ./joss.04738/10.21105.joss.04738.pdf,
# reduces it to the bare filename (10.21105.joss.04738.pdf),
# then calls out to the JOSS API and returns the title.
def title_from_paper(paper):
    """Return the title of the JOSS paper at path *paper*, or None."""
    # os is imported at module level; take just the filename, which is
    # what convert_pdf_string (via call_joss_api) expects
    filename = os.path.basename(paper)
    response = call_joss_api(filename)
    return extract_title(response)
# Build a Markdown summary of the paper: title, URL, author, editor, reviewers.
# (Despite the name, this RETURNS the summary string rather than printing it.)
def print_summary(paper):
    """Return a Markdown-formatted summary for *paper* via the JOSS API."""
    response = call_joss_api(paper)
    # Pull each displayed field out of the (possibly None) API response
    title = extract_title(response)
    review_url = extract_review_url(response)
    author = extract_author(response)
    editor = extract_editor(response)
    editor_status = extract_editor_status(response)
    reviewers = extract_reviewers(response)
    # Assemble the Markdown block shown to the editors
    return (
        f"[{title}]({review_url})\n"
        f"Submitting author: ```{author}```\n"
        f"Handling editor: ```{editor} ({editor_status})```\n"
        f"Reviewers: ```{reviewers}```"
    )
# Run the script
if __name__ == "__main__":
    # Locate the historical embeddings file checked out by the Action
    github_workspace = os.environ.get('GITHUB_WORKSPACE', '')  # empty default when unset
    embeddings_path = os.path.join(github_workspace, 'papers-embeddings', 'embeddings.csv.gz')
    # Read the stored embeddings (one row per historical paper)
    embeddings = pd.read_csv(embeddings_path, compression='gzip')
    # Path to the incoming paper PDF, provided by the Actions workflow
    incoming_paper = os.environ['PDF_PATH']
    incoming_embedding = create_embedding(incoming_paper)
    # create_embedding returns None when the PDF cannot be parsed; exit
    # with a clear message instead of crashing on attribute access below
    if incoming_embedding is None:
        sys.exit(f"Could not create an embedding for {incoming_paper}")
    # Build a (paper, embedding) frame from the stored JSON responses.
    # Collect rows in a list and construct the DataFrame once — pd.concat
    # inside the loop is quadratic. (Also avoid shadowing builtin `dict`.)
    rows = []
    for index, row in embeddings.iterrows():
        stored = json.loads(row.embedding)  # full embedding API response
        rows.append((row.paper, stored['data'][0]['embedding']))
    only_embeddings = pd.DataFrame(rows, columns=["paper", "embedding"])
    # Score every historical paper against the incoming paper's embedding
    target_embedding = incoming_embedding.data[0]['embedding']
    only_embeddings['similarity'] = only_embeddings['embedding'].apply(
        lambda x: cosine_similarity(x, target_embedding))
    # Most similar first
    only_embeddings = only_embeddings.sort_values(by=['similarity'], ascending=False)
    # The best match is the incoming paper itself, so report rows 1-5
    output_lines = ["**Five most similar historical JOSS papers:**\n"]
    for index, row in only_embeddings[1:6].iterrows():
        output_lines.append(print_summary(row.paper))
        output_lines.append(f"Similarity score: {row.similarity:.4f}")
        output_lines.append("")  # blank line between papers
    output_lines.append("")
    output_lines.append("⚠️ **Note to editors:** If these papers look like they might be a good match, click through to the review issue for that paper and invite one or more of the authors *before* considering asking the reviewers of these papers to review again for JOSS.")
    output_content = "\n".join(output_lines)
    # Publish the multi-line result as a GitHub Actions step output using
    # heredoc syntax (the old %0A-encoding approach is no longer needed)
    with open(os.getenv('GITHUB_OUTPUT'), 'a') as f:
        f.write(f"recommendations<<EOF\n{output_content}\nEOF\n")