Initial code

monoxgas · monoxgas · commit 0861db2b8a49 · 2024-01-02T19:41:09.000-07:00
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -0,0 +1,43 @@
+# Paperstack workflow: installs the Python dependencies and runs paperstack.py
+# with Notion and OpenAI credentials supplied from repository secrets.
+name: Paperstack
+
+on:
+  # Cron fires at minute 0 of every hour.
+  schedule:
+    - cron: '0 * * * *'
+  # Manual trigger; each input must be the string 'true' to enable its search.
+  workflow_dispatch:
+    inputs:
+      search-arxiv:
+        description: 'Search Arxiv?'
+        required: false
+        default: 'false'
+      search-scholar:
+        description: 'Search Semantic Scholar?'
+        required: false
+        default: 'false'
+            
+jobs:
+  run-script:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Check out code
+      uses: actions/checkout@v2
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+
+    # Each expression below expands to its CLI flag only when the matching
+    # input equals 'true'; otherwise it expands to an empty string.
+    - name: Run script
+      run: |
+        python paperstack.py \
+          ${{ github.event.inputs.search-arxiv == 'true' && '--search-arxiv' || '' }} \
+          ${{ github.event.inputs.search-scholar == 'true' && '--search-semantic-scholar' || '' }}
+      env:
+        NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
+        NOTION_DATABASE_ID: ${{ secrets.NOTION_DATABASE_ID }}
+        OPENAI_API_TOKEN: ${{ secrets.OPENAI_API_TOKEN }}
diff --git a/_types.py b/_types.py
@@ -0,0 +1,22 @@
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+
+class Focus(str, Enum):
+    """Focus label assigned to a paper; every member's value equals its name.
+
+    The values are the option names stored in the Notion "Focus" select
+    property and the labels offered to the model when classifying an abstract.
+    """
+    Offensive = "Offensive"
+    Defensive = "Defensive"
+    Adversarial = "Adversarial"
+    Safety = "Safety"
+    Other = "Other"
+
+@dataclass
+class Paper:
+    """One paper record exchanged between the Notion, arXiv and OpenAI helpers."""
+    # Notion page id; None for papers that do not have a Notion page yet.
+    page_id: str | None
+    title: str | None
+    url: str | None
+    # Identifier parsed out of an arXiv URL (see convert_arxiv_url_to_id).
+    arxiv_id: str | None
+    focus: Focus | None
+    # Short generated summary (stored in the Notion "Summary" property).
+    summary: str | None
+    # Full abstract text taken from the arXiv result.
+    abstract: str | None
+    authors: list[str]
+    published: datetime | None
diff --git a/arxiv_utils.py b/arxiv_utils.py
@@ -0,0 +1,81 @@
+import re
+
+import arxiv  # type: ignore
+
+from _types import Paper
+
+# Module-level arXiv API client shared by the search helpers below.
+client = arxiv.Client()
+
+
+def convert_arxiv_url_to_id(url: str) -> str | None:
+    """Extract a new-style arXiv identifier from *url*.
+
+    Matches ``YYMM.NNNN`` (identifiers issued before 2015) as well as
+    ``YYMM.NNNNN`` (2015 onwards). A version suffix such as ``v2`` is not
+    part of the returned identifier.
+
+    Returns:
+        The first identifier found in *url*, or ``None`` if there is none.
+    """
+    # Greedy {4,5} keeps all five digits of modern IDs while still accepting
+    # the four-digit sequence numbers used by older papers (e.g. 1412.6572).
+    match = re.search(r"\d{4}\.\d{4,5}", url)
+    return match.group(0) if match else None
+
+
+def arxiv_result_to_paper(result: arxiv.Result) -> Paper:
+    """Convert an arXiv search result into a ``Paper``.
+
+    ``page_id``, ``focus`` and ``summary`` are left as ``None``; the arXiv
+    summary text is stored as the paper's abstract.
+    """
+    entry_url = result.entry_id
+    author_names = [author.name for author in result.authors]
+
+    return Paper(
+        page_id=None,
+        title=result.title,
+        url=entry_url,
+        arxiv_id=convert_arxiv_url_to_id(entry_url),
+        focus=None,
+        summary=None,
+        abstract=result.summary,
+        authors=author_names,
+        published=result.published,
+    )
+
+
+def search_arxiv(query: str, max_results=10) -> list[arxiv.Result]:
+    """Run *query* against arXiv and return the results as a list.
+
+    At most *max_results* entries are requested, sorted by submission date.
+    """
+    search = arxiv.Search(
+        query,
+        max_results=max_results,
+        sort_by=arxiv.SortCriterion.SubmittedDate,
+    )
+    found = client.results(search)
+    return list(found)
+
+
+def search_arxiv_as_paper(query: str, max_results=10) -> list[Paper]:
+    """Search arXiv for *query* and return each hit converted to a ``Paper``."""
+    hits = search_arxiv(query, max_results)
+    return list(map(arxiv_result_to_paper, hits))
+
+
+def search_arxiv_by_id(id: str) -> arxiv.Result | None:
+    """Look up a single arXiv entry by identifier.
+
+    Returns the first result yielded for *id*, or ``None`` when the lookup
+    yields nothing.
+    """
+    lookup = arxiv.Search(id_list=[id])
+    return next(iter(client.results(lookup)), None)
+
+
+def fill_papers_with_arxiv(papers: list[Paper]) -> list[Paper]:
+    """Fill in arXiv metadata, in place, for papers that have no publish date.
+
+    A paper is looked up by arXiv identifier when one is known (taken from its
+    URL, or already present on the record) and otherwise by searching for its
+    title. Papers that already have ``published`` set are left untouched, and
+    papers that cannot be found are reported on stdout and skipped.
+
+    Args:
+        papers: Records to complete; they are mutated in place.
+
+    Returns:
+        The same list object that was passed in.
+    """
+    for paper in papers:
+        if paper.published:
+            continue
+
+        result: arxiv.Result | None = None
+
+        if paper.url:
+            # Prefer the ID embedded in the URL, but keep an ID that is
+            # already known when the URL is not an arXiv link instead of
+            # overwriting it with None.
+            paper.arxiv_id = convert_arxiv_url_to_id(paper.url) or paper.arxiv_id
+
+        if paper.arxiv_id:
+            result = search_arxiv_by_id(paper.arxiv_id)
+
+        if paper.title and not result:
+            # NOTE(review): search_arxiv sorts by submission date, so this
+            # takes the newest hit for the title text, which may not be the
+            # most relevant one.
+            searched = search_arxiv(paper.title, max_results=1)
+            result = searched[0] if searched else None
+
+        if not result:
+            print(f'[!] Could not find arxiv result for "{paper.title}" [{paper.url}]')
+            continue
+
+        paper.title = result.title
+        paper.url = result.entry_id
+        paper.arxiv_id = convert_arxiv_url_to_id(result.entry_id)
+        paper.abstract = result.summary
+        paper.authors = [a.name for a in result.authors]
+        paper.published = result.published
+
+    return papers
diff --git a/notion_utils.py b/notion_utils.py
@@ -0,0 +1,79 @@
+import typing as t
+from datetime import datetime
+
+from notion_client import Client
+from notion_client.helpers import collect_paginated_api
+
+from _types import Paper, Focus
+
+# Local alias for the notion_client ``Client`` class, used in annotations.
+NotionClient = Client
+
+
+def get_notion_client(token: str) -> NotionClient:
+    """Create a Notion API client that authenticates with *token*."""
+    notion = NotionClient(auth=token)
+    return notion
+
+
+def _rich_text_to_str(segments: list[dict[str, t.Any]]) -> str | None:
+    """Join the plain text of all Notion rich-text segments; None if empty."""
+    if not segments:
+        return None
+    # A single property value may be split into several segments (formatting
+    # changes, mentions, equations); every segment exposes "plain_text".
+    return "".join(segment["plain_text"] for segment in segments)
+
+
+def get_papers_from_notion(client: NotionClient, database_id: str) -> list[Paper]:
+    """Load every page of the Notion database as a ``Paper``.
+
+    Reads the "Title", "URL", "Summary", "Authors", "Published" and "Focus"
+    properties of each page. ``arxiv_id`` and ``abstract`` are not stored in
+    Notion and are returned as ``None``.
+
+    Args:
+        client: Authenticated Notion client.
+        database_id: Id of the database to query.
+
+    Returns:
+        One ``Paper`` per database page, in the order the API returned them.
+
+    Raises:
+        ValueError: If a page's "Focus" option is not a ``Focus`` member.
+    """
+    results = collect_paginated_api(client.databases.query, database_id=database_id)
+
+    papers: list[Paper] = []
+    for result in results:
+        properties = result["properties"]
+
+        published_date = properties["Published"]["date"]
+        focus_option = properties["Focus"]["select"]
+
+        papers.append(
+            Paper(
+                page_id=result["id"],
+                title=_rich_text_to_str(properties["Title"]["title"]),
+                url=properties["URL"]["url"],
+                arxiv_id=None,
+                focus=Focus(focus_option["name"]) if focus_option else None,
+                summary=_rich_text_to_str(properties["Summary"]["rich_text"]),
+                abstract=None,
+                authors=[
+                    author["name"] for author in properties["Authors"]["multi_select"]
+                ],
+                published=(
+                    datetime.fromisoformat(published_date["start"])
+                    if published_date
+                    else None
+                ),
+            )
+        )
+
+    return papers
+
+
+def write_papers_to_notion(
+    client: NotionClient, database_id: str, papers: list[Paper]
+) -> None:
+    """Create or update one Notion page per paper.
+
+    Only fields that are set on a paper are sent. A paper with a ``page_id``
+    updates that page; a paper without one is created inside *database_id*.
+
+    Args:
+        client: Authenticated Notion client.
+        database_id: Database that receives newly created pages.
+        papers: Records to write.
+    """
+    for paper in papers:
+        properties: dict[str, t.Any] = {}
+        if paper.title:
+            properties["Title"] = {"title": [{"text": {"content": paper.title}}]}
+        if paper.url:
+            properties["URL"] = {"url": paper.url}
+        if paper.summary:
+            properties["Summary"] = {
+                "rich_text": [{"text": {"content": paper.summary}}]
+            }
+        if paper.authors:
+            # Notion rejects commas in select/multi-select option names, so
+            # drop them rather than let one author fail the whole request.
+            properties["Authors"] = {
+                "multi_select": [
+                    {"name": author.replace(",", "")} for author in paper.authors
+                ]
+            }
+        if paper.published:
+            properties["Published"] = {"date": {"start": paper.published.isoformat()}}
+        if paper.focus:
+            properties["Focus"] = {"select": {"name": paper.focus.value}}
+
+        if paper.page_id:
+            client.pages.update(paper.page_id, properties=properties)
+        else:
+            client.pages.create(
+                parent={"database_id": database_id}, properties=properties
+            )
diff --git a/openai_utils.py b/openai_utils.py
@@ -0,0 +1,59 @@
+from openai import OpenAI
+
+from _types import Focus
+
+# Local alias for the OpenAI client class, used in annotations.
+OpenAIClient = OpenAI
+
+# System prompt for summarize_abstract_with_openai.
+SUMMARIZE_ABSTRACT_PROMPT = """\
+You will be provided with an abstract of a scientific paper. \
+Compress this abstract in 1-2 sentences. Use very concise language usable as \
+bullet points on a slide deck. Respond ONLY with your summary.
+"""
+
+# System prompt template for get_focus_label_from_abstract; the {labels}
+# placeholder is filled with one "- <label>" line per Focus member.
+ASSIGN_LABEL_PROMPT = """\
+You will be provided with an abstract of a scientific paper. \
+Assess the most applicable focus label based on the target audience, \
+research focus, produced materials, and key outcomes.
+
+{labels}
+
+Respond with ONLY ONE of the labels above. Do not include anything else in your response.
+"""
+
+def get_openai_client(token: str) -> OpenAIClient:
+    """Create an OpenAI API client that uses *token* as its API key."""
+    openai_client = OpenAIClient(api_key=token)
+    return openai_client
+
+
+def summarize_abstract_with_openai(client: OpenAIClient, abstract: str) -> str:
+    """Ask the chat model for a 1-2 sentence summary of *abstract*.
+
+    Args:
+        client: OpenAI client used for the request.
+        abstract: Abstract text sent as the user message.
+
+    Returns:
+        The model's reply with surrounding whitespace removed, or an empty
+        string when the reply carries no text content.
+    """
+    response = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": SUMMARIZE_ABSTRACT_PROMPT},
+            {"role": "user", "content": f"{abstract}"},
+        ],
+        temperature=0.5,
+        max_tokens=100,
+    )
+
+    # message.content is optional in the API response; treat a missing reply
+    # as an empty summary instead of raising AttributeError on None.
+    content = response.choices[0].message.content
+    return (content or "").strip()
+
+def get_focus_label_from_abstract(client: OpenAIClient, abstract: str) -> Focus | None:
+    """Ask the chat model to pick the ``Focus`` label that best fits *abstract*.
+
+    Args:
+        client: OpenAI client used for the request.
+        abstract: Abstract text sent as the user message.
+
+    Returns:
+        The matching ``Focus`` member, or ``None`` when *abstract* is empty or
+        the reply does not name one of the labels.
+    """
+    # Nothing to classify: avoid sending an empty (or "None") user message.
+    if not abstract:
+        return None
+
+    system_prompt = ASSIGN_LABEL_PROMPT.format(
+        labels="\n".join([f"- {f.value}" for f in Focus])
+    )
+
+    response = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"{abstract}"},
+        ],
+        temperature=0.5,
+        max_tokens=10,
+    )
+
+    # message.content is optional in the API response.
+    content = response.choices[0].message.content or ""
+
+    # The prompt lists labels as "- Label", and models often echo the bullet,
+    # add quotes or a trailing period, or change case; normalize before
+    # matching so such replies are not discarded.
+    normalized = content.strip().strip("-*\"'`. \t\n").casefold()
+    labels_by_key = {f.value.casefold(): f for f in Focus}
+    return labels_by_key.get(normalized)
diff --git a/paperstack.py b/paperstack.py
@@ -0,0 +1,99 @@
+import argparse
+import os
+
+from notion_utils import (
+    get_notion_client,
+    get_papers_from_notion,
+    write_papers_to_notion,
+)
+from arxiv_utils import fill_papers_with_arxiv, search_arxiv_as_paper
+from openai_utils import (
+    get_focus_label_from_abstract,
+    get_openai_client,
+    summarize_abstract_with_openai,
+)
+from scholar_utils import get_recommended_arxiv_ids_from_semantic_scholar
+
+# Default value of the --arxiv-search-query option: quoted phrases joined by OR.
+ARXIV_SEARCH = """\
+"adversarial attacks" OR "language model attacks" OR "LLM vulnerabilities" OR \
+"AI security" OR "machine learning security" OR "jailbreak" OR "bypassing AI"\
+"""
+
+
+def main():
+    """Synchronize the Notion paper database with arXiv and OpenAI.
+
+    Steps: load papers from Notion, fill in missing arXiv metadata, optionally
+    add new papers from an arXiv search and from Semantic Scholar
+    recommendations, generate missing summaries and focus labels, then write
+    everything back to Notion. Credentials default to the NOTION_TOKEN,
+    NOTION_DATABASE_ID and OPENAI_API_TOKEN environment variables.
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--notion-token",
+        type=str,
+        default=os.environ.get("NOTION_TOKEN"),
+        help="Notion token",
+    )
+    parser.add_argument(
+        "--database-id",
+        type=str,
+        default=os.environ.get("NOTION_DATABASE_ID"),
+        help="Notion database id",
+    )
+    parser.add_argument(
+        "--openai-token",
+        type=str,
+        default=os.environ.get("OPENAI_API_TOKEN"),
+        help="OpenAI token",
+    )
+    parser.add_argument("--arxiv-search-query", type=str, default=ARXIV_SEARCH)
+    parser.add_argument("--search-arxiv", action="store_true", default=False)
+    parser.add_argument("--search-semantic-scholar", action="store_true", default=False)
+
+    args = parser.parse_args()
+
+    print("[+] Paperstack")
+
+    notion_client = get_notion_client(args.notion_token)
+    openai_client = get_openai_client(args.openai_token)
+
+    # Get papers from Notion
+    print(" |- Getting papers from Notion")
+    papers = get_papers_from_notion(notion_client, args.database_id)
+
+    # Fill in missing data from arXiv
+    print(" |- Filling in missing data from arXiv")
+    papers = fill_papers_with_arxiv(papers)
+
+    if args.search_arxiv:
+        # Search arXiv for new papers and deduplicate
+        print(" |- Searching arXiv")
+        existing_titles = {paper.title for paper in papers}
+        # argparse stores --arxiv-search-query as `arxiv_search_query`.
+        for searched_paper in search_arxiv_as_paper(
+            args.arxiv_search_query, max_results=5
+        ):
+            if searched_paper.title not in existing_titles:
+                print(f"    |- {(searched_paper.title or '')[:50]}...")
+                papers.append(searched_paper)
+                existing_titles.add(searched_paper.title)
+
+    if args.search_semantic_scholar:
+        print(" |- Getting related papers from Semantic Scholar")
+        recommended_papers = get_recommended_arxiv_ids_from_semantic_scholar(papers)
+        papers.extend(fill_papers_with_arxiv(recommended_papers))
+        print(f"    |- {len(recommended_papers)} new papers")
+
+    # Build summaries
+    print(" |- Building summaries")
+    for paper in papers:
+        if not paper.summary and paper.abstract:
+            print(f"    |- {(paper.title or '')[:50]}...")
+            paper.summary = summarize_abstract_with_openai(
+                openai_client, paper.abstract
+            )
+
+    # Assigning focus labels
+    print(" |- Assigning focus labels")
+    for paper in papers:
+        if paper.focus:
+            continue
+
+        # Papers loaded from Notion have no abstract unless arXiv data was
+        # fetched during this run, so fall back to the stored summary and
+        # skip papers that have neither.
+        label_text = paper.abstract or paper.summary
+        if not label_text:
+            continue
+
+        paper.focus = get_focus_label_from_abstract(openai_client, label_text)
+        print(f"    |- {paper.focus}")
+
+    print(f" |- Writing back to Notion [{len(papers)}]...")
+    write_papers_to_notion(notion_client, args.database_id, papers)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+arxiv
+notion-client
+openai
+semanticscholar
diff --git a/scholar_utils.py b/scholar_utils.py