Skip to content

Commit 0861db2

Browse files
committed
Initial code
1 parent 516f9d3 commit 0861db2

File tree

8 files changed

+453
-0
lines changed

8 files changed

+453
-0
lines changed

.github/workflows/main.yml

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Runs paperstack.py every hour and on manual dispatch.
name: Paperstack

on:
  schedule:
    # Top of every hour (UTC).
    - cron: '0 * * * *'
  workflow_dispatch:
    # Manual runs can opt in to the two search modes; scheduled runs carry no
    # inputs, so both flags evaluate to '' below and no search is performed.
    inputs:
      search-arxiv:
        description: 'Search Arxiv?'
        required: false
        default: 'false'
      search-scholar:
        description: 'Search Semantic Scholar?'
        required: false
        default: 'false'

jobs:
  run-script:
    runs-on: ubuntu-latest

    steps:
      - name: Check out code
        # v2 ran on a Node runtime that GitHub has deprecated; use a current major.
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run script
        # Inputs are only compared against 'true' and mapped to fixed flag
        # strings, so no user-supplied text reaches the shell.
        run: |
          python paperstack.py \
            ${{ github.event.inputs.search-arxiv == 'true' && '--search-arxiv' || '' }} \
            ${{ github.event.inputs.search-scholar == 'true' && '--search-semantic-scholar' || '' }}
        env:
          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
          NOTION_DATABASE_ID: ${{ secrets.NOTION_DATABASE_ID }}
          OPENAI_API_TOKEN: ${{ secrets.OPENAI_API_TOKEN }}

_types.py

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from dataclasses import dataclass
2+
from datetime import datetime
3+
from enum import Enum
4+
5+
class Focus(str, Enum):
    """Focus label that can be attached to a paper.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    Offensive = "Offensive"
    Defensive = "Defensive"
    Adversarial = "Adversarial"
    Safety = "Safety"
    Other = "Other"
11+
12+
@dataclass
class Paper:
    """Record describing a single paper.

    Every field except ``authors`` is optional and may be ``None`` until it
    has been filled in.
    """

    # Notion page id; None when the paper has no Notion page yet.
    page_id: str | None
    # Paper title.
    title: str | None
    # Link to the paper.
    url: str | None
    # arXiv identifier, when known.
    arxiv_id: str | None
    # Assigned focus label, if any.
    focus: Focus | None
    # Short summary of the abstract.
    summary: str | None
    # Full abstract text.
    abstract: str | None
    # Author names.
    authors: list[str]
    # Publication timestamp.
    published: datetime | None

arxiv_utils.py

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import re
2+
3+
import arxiv # type: ignore
4+
5+
from _types import Paper
6+
7+
# Module-level arXiv API client shared by the search helpers below.
client = arxiv.Client()
8+
9+
10+
def convert_arxiv_url_to_id(url: str) -> str | None:
11+
match = re.search(r"\d{4}\.\d{5}", url)
12+
return match.group(0) if match else None
13+
14+
15+
def arxiv_result_to_paper(result: arxiv.Result) -> Paper:
    """Build a new ``Paper`` from one arXiv search result.

    The paper has no Notion page, focus label or summary yet; the arXiv
    summary text is stored as the abstract.
    """
    entry_url = result.entry_id
    author_names = [author.name for author in result.authors]

    return Paper(
        page_id=None,
        title=result.title,
        url=entry_url,
        arxiv_id=convert_arxiv_url_to_id(entry_url),
        focus=None,
        summary=None,
        abstract=result.summary,
        authors=author_names,
        published=result.published,
    )
27+
28+
29+
def search_arxiv(query: str, max_results=10) -> list[arxiv.Result]:
    """Run *query* against arXiv and return the hits as a list.

    Results are sorted by submission date and capped at *max_results*.
    """
    search = arxiv.Search(
        query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    hits = client.results(search)
    return list(hits)
39+
40+
41+
def search_arxiv_as_paper(query: str, max_results=10) -> list[Paper]:
    """Search arXiv and return each hit converted to a ``Paper``."""
    hits = search_arxiv(query, max_results)
    return list(map(arxiv_result_to_paper, hits))
45+
46+
47+
def search_arxiv_by_id(id: str) -> arxiv.Result | None:
    """Return the first arXiv result for the given identifier, or ``None``."""
    lookup = arxiv.Search(id_list=[id])
    # Only the first result is consumed; the rest of the iterator is ignored.
    return next(iter(client.results(lookup)), None)
51+
52+
53+
def fill_papers_with_arxiv(papers: list[Paper]) -> list[Paper]:
    """Fill in missing metadata on *papers* from arXiv, mutating them in place.

    Papers that already have a ``published`` value are skipped. For the rest,
    the arXiv entry is looked up by id (taken from the URL when it contains
    one, otherwise from the paper's existing ``arxiv_id``) and, failing that,
    by a title search. When an entry is found, the title, URL, id, abstract,
    authors and published date are overwritten from it.

    Args:
        papers: Papers to complete.

    Returns:
        The same list object that was passed in.
    """
    for paper in papers:
        if paper.published:
            continue

        result: arxiv.Result | None = None

        if paper.url:
            # Prefer the id embedded in the URL, but do not discard an id the
            # paper already carries when the URL is not an arXiv link.
            paper.arxiv_id = convert_arxiv_url_to_id(paper.url) or paper.arxiv_id

        if paper.arxiv_id:
            result = search_arxiv_by_id(paper.arxiv_id)

        if paper.title and not result:
            # NOTE(review): this takes the single newest hit for a free-text
            # title query, which is not guaranteed to be the same paper.
            searched = search_arxiv(paper.title, max_results=1)
            result = searched[0] if searched else None

        if not result:
            print(f'[!] Could not find arxiv result for "{paper.title}" [{paper.url}]')
            continue

        paper.title = result.title
        paper.url = result.entry_id
        paper.arxiv_id = convert_arxiv_url_to_id(result.entry_id)
        paper.abstract = result.summary
        paper.authors = [a.name for a in result.authors]
        paper.published = result.published

    return papers

notion_utils.py

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import typing as t
2+
from datetime import datetime
3+
4+
from notion_client import Client
5+
from notion_client.helpers import collect_paginated_api
6+
7+
from _types import Paper, Focus
8+
9+
# Local alias for the notion_client ``Client`` class used throughout this module.
NotionClient = Client
10+
11+
12+
def get_notion_client(token: str) -> NotionClient:
    """Create a Notion client that authenticates with *token*."""
    notion = NotionClient(auth=token)
    return notion
14+
15+
16+
def _rich_text_to_plain(segments: list[dict[str, t.Any]]) -> str | None:
    """Join every segment of a Notion rich-text array into one plain string."""
    text = "".join(segment.get("plain_text", "") for segment in segments)
    return text or None


def get_papers_from_notion(client: NotionClient, database_id: str) -> list[Paper]:
    """Read every page of a Notion database and convert it to a ``Paper``.

    The database is expected to expose the properties ``Title``, ``URL``,
    ``Summary``, ``Authors``, ``Published`` and ``Focus``.

    Args:
        client: Notion client used to query the database.
        database_id: Id of the database to read.

    Returns:
        One ``Paper`` per page, with ``arxiv_id`` and ``abstract`` left as
        ``None`` because they are not read from Notion.

    Raises:
        KeyError: If a page lacks one of the expected properties.
        ValueError: If a page's ``Focus`` option is not a ``Focus`` member.
    """
    results = collect_paginated_api(client.databases.query, database_id=database_id)

    papers: list[Paper] = []
    for result in results:
        properties = result["properties"]

        # Rich text arrives as a list of segments (formatting runs, mentions,
        # equations). Reading only the first one would truncate the value, and
        # the truncated text would later be written back over the original.
        title = _rich_text_to_plain(properties["Title"]["title"])
        summary = _rich_text_to_plain(properties["Summary"]["rich_text"])

        published = properties["Published"]["date"]
        published = datetime.fromisoformat(published["start"]) if published else None

        focus = properties["Focus"]["select"]
        focus = Focus(focus["name"]) if focus else None

        papers.append(
            Paper(
                page_id=result["id"],
                title=title,
                url=properties["URL"]["url"],
                arxiv_id=None,
                focus=focus,
                summary=summary,
                abstract=None,
                authors=[
                    author["name"] for author in properties["Authors"]["multi_select"]
                ],
                published=published,
            )
        )

    return papers
50+
51+
52+
def write_papers_to_notion(
    client: NotionClient, database_id: str, papers: list[Paper]
) -> None:
    """Create or update one Notion page per paper.

    Only fields with a truthy value are sent, so empty fields never overwrite
    what is already stored. A paper with a ``page_id`` updates that page; a
    paper without one is created as a new page in *database_id*.

    Args:
        client: Notion client used for the writes.
        database_id: Database that receives newly created pages.
        papers: Papers to write.
    """
    for paper in papers:
        properties: dict[str, t.Any] = {}
        if paper.title:
            properties["Title"] = {"title": [{"text": {"content": paper.title}}]}
        if paper.url:
            properties["URL"] = {"url": paper.url}
        if paper.summary:
            properties["Summary"] = {
                "rich_text": [{"text": {"content": paper.summary}}]
            }
        if paper.authors:
            # Notion rejects select/multi-select option names that contain
            # commas, so replace them and collapse the leftover whitespace.
            author_names = [
                " ".join(author.replace(",", " ").split()) for author in paper.authors
            ]
            properties["Authors"] = {
                "multi_select": [{"name": name} for name in author_names if name]
            }
        if paper.published:
            properties["Published"] = {"date": {"start": paper.published.isoformat()}}
        if paper.focus:
            properties["Focus"] = {"select": {"name": paper.focus.value}}

        if paper.page_id:
            client.pages.update(paper.page_id, properties=properties)
        else:
            client.pages.create(
                parent={"database_id": database_id}, properties=properties
            )

openai_utils.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
from openai import OpenAI
2+
3+
from _types import Focus
4+
5+
# Local alias for the ``OpenAI`` client class, used in annotations below.
OpenAIClient = OpenAI

# System prompt for summarization. The trailing backslashes join the source
# lines, so the first three lines form a single line of prompt text.
SUMMARIZE_ABSTRACT_PROMPT = """\
You will be provided with an abstract of a scientific paper. \
Compress this abstract in 1-2 sentences. Use very concise language usable as \
bullet points on a slide deck. Respond ONLY with your summary.
"""

# System prompt template for label assignment. ``{labels}`` is filled in with
# str.format before the prompt is sent.
ASSIGN_LABEL_PROMPT = """\
You will be provided with an abstract of a scientific paper. \
Assess the most applicable focus label based on the target audience, \
research focus, produced materials, and key outcomes.

{labels}

Respond with ONLY ONE of the labels above. Do not include anything else in your response.
"""
22+
23+
def get_openai_client(token: str) -> OpenAIClient:
    """Create an OpenAI client that uses *token* as its API key."""
    openai_client = OpenAI(api_key=token)
    return openai_client
25+
26+
27+
def summarize_abstract_with_openai(client: OpenAIClient, abstract: str) -> str:
    """Ask the chat model for a 1-2 sentence summary of *abstract*.

    Args:
        client: OpenAI client used for the request.
        abstract: Abstract text sent as the user message.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": SUMMARIZE_ABSTRACT_PROMPT},
            {"role": "user", "content": f"{abstract}"},
        ],
        temperature=0.5,
        max_tokens=100,
    )

    # ``content`` is optional in the response, so guard before stripping
    # rather than letting ``None.strip()`` raise.
    content = response.choices[0].message.content
    return content.strip() if content else ""
39+
40+
def get_focus_label_from_abstract(
    client: OpenAIClient, abstract: str | None
) -> Focus | None:
    """Ask the chat model to pick one ``Focus`` label for *abstract*.

    Args:
        client: OpenAI client used for the request.
        abstract: Abstract text to classify. When it is empty or ``None`` no
            request is made.

    Returns:
        The matching ``Focus`` member, or ``None`` when there is no abstract,
        the reply is empty, or the reply does not name a known label.
    """
    # Without this guard the f-string below would send the literal text
    # "None" to the model and store whatever label came back.
    if not abstract:
        return None

    system_prompt = ASSIGN_LABEL_PROMPT.format(
        labels="\n".join([f"- {f.value}" for f in Focus])
    )

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"{abstract}"},
        ],
        temperature=0.5,
        max_tokens=10,
    )

    content = response.choices[0].message.content
    if not content:
        return None

    # The labels are shown to the model as "- Label" bullets, and replies
    # often echo the bullet, add a full stop or change case; normalize those
    # before matching.
    normalized = content.strip().strip("-*.\"'` \t").casefold()
    labels_by_name = {f.value.casefold(): f for f in Focus}
    return labels_by_name.get(normalized)

paperstack.py

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import argparse
2+
import os
3+
4+
from notion_utils import (
5+
get_notion_client,
6+
get_papers_from_notion,
7+
write_papers_to_notion,
8+
)
9+
from arxiv_utils import fill_papers_with_arxiv, search_arxiv_as_paper
10+
from openai_utils import (
11+
get_focus_label_from_abstract,
12+
get_openai_client,
13+
summarize_abstract_with_openai,
14+
)
15+
from scholar_utils import get_recommended_arxiv_ids_from_semantic_scholar
16+
17+
ARXIV_SEARCH = """\
18+
"adversarial attacks" OR "language model attacks" OR "LLM vulnerabilities" OR \
19+
"AI security" OR "machine learning security" OR "jailbreak" OR "bypassing AI"\
20+
"""
21+
22+
23+
def main():
    """Synchronize the Notion paper database with arXiv and OpenAI.

    Steps, in order:
      1. Load the existing papers from Notion.
      2. Fill in missing metadata from arXiv.
      3. Optionally add new papers from an arXiv search (``--search-arxiv``).
      4. Optionally add Semantic Scholar recommendations
         (``--search-semantic-scholar``).
      5. Summarize abstracts and assign focus labels with OpenAI.
      6. Write every paper back to Notion.

    Tokens and the database id default to the ``NOTION_TOKEN``,
    ``NOTION_DATABASE_ID`` and ``OPENAI_API_TOKEN`` environment variables.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--notion-token",
        type=str,
        default=os.environ.get("NOTION_TOKEN"),
        help="Notion token",
    )
    parser.add_argument(
        "--database-id",
        type=str,
        default=os.environ.get("NOTION_DATABASE_ID"),
        help="Notion database id",
    )
    parser.add_argument(
        "--openai-token",
        type=str,
        default=os.environ.get("OPENAI_API_TOKEN"),
        help="OpenAI token",
    )
    parser.add_argument("--arxiv-search-query", type=str, default=ARXIV_SEARCH)
    parser.add_argument("--search-arxiv", action="store_true", default=False)
    parser.add_argument("--search-semantic-scholar", action="store_true", default=False)

    args = parser.parse_args()

    print("[+] Paperstack")

    notion_client = get_notion_client(args.notion_token)
    openai_client = get_openai_client(args.openai_token)

    # Get papers from Notion
    print(" |- Getting papers from Notion")
    papers = get_papers_from_notion(notion_client, args.database_id)

    # Fill in missing data from arXiv
    print(" |- Filling in missing data from arXiv")
    papers = fill_papers_with_arxiv(papers)

    if args.search_arxiv:
        # Search arXiv for new papers and deduplicate
        print(" |- Searching arXiv")
        existing_titles = {paper.title for paper in papers}
        # argparse stores --arxiv-search-query as ``arxiv_search_query``;
        # ``args.arxiv_search`` does not exist and raised AttributeError.
        for searched_paper in search_arxiv_as_paper(
            args.arxiv_search_query, max_results=5
        ):
            if searched_paper.title not in existing_titles:
                title_preview = (searched_paper.title or "")[:50]
                print(f" |- {title_preview}...")
                papers.append(searched_paper)

    if args.search_semantic_scholar:
        print(" |- Getting related papers from Semantic Scholar")
        recommended_papers = get_recommended_arxiv_ids_from_semantic_scholar(papers)
        papers.extend(fill_papers_with_arxiv(recommended_papers))
        print(f" |- {len(recommended_papers)} new papers")

    # Build summaries
    print(" |- Building summaries")
    for paper in papers:
        if not paper.summary and paper.abstract:
            # A paper may have no title; slicing ``None`` would raise.
            title_preview = (paper.title or "")[:50]
            print(f" |- {title_preview}...")
            paper.summary = summarize_abstract_with_openai(
                openai_client, paper.abstract
            )

    # Assigning focus labels
    print(" |- Assigning focus labels")
    for paper in papers:
        # Papers loaded from Notion carry no abstract unless arXiv filled it
        # in, so skip them rather than classifying a missing abstract.
        if not paper.focus and paper.abstract:
            paper.focus = get_focus_label_from_abstract(openai_client, paper.abstract)
            print(f" |- {paper.focus}")

    print(f" |- Writing back to Notion [{len(papers)}]...")
    write_papers_to_notion(notion_client, args.database_id, papers)
97+
98+
# Run the sync only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

requirements.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
arxiv
2+
notion-client
3+
openai
4+
semanticscholar

0 commit comments

Comments
 (0)