Skip to content
This repository was archived by the owner on Jan 8, 2025. It is now read-only.

Commit 22fe06d

Browse files
Merge branch 'ml4ai:main' into main
2 parents 5e5787f + 6cda598 commit 22fe06d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+845
-283
lines changed

.drone.yml

Lines changed: 0 additions & 91 deletions
This file was deleted.

.github/workflows/tests-and-docs.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,17 @@ jobs:
8181
working-directory: .
8282
run: |
8383
# retrieve latest model for img2mml component
84-
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
84+
pip install huggingface_hub
85+
python scripts/retrieve_model_ci.py
86+
87+
# Install askem
8588
pip install ".[all]"
89+
8690
# Install tree-sitter parser (for Python component unit tests)
8791
- name: Install tree-sitter parsers
8892
working-directory: .
8993
run: python skema/program_analysis/tree_sitter_parsers/build_parsers.py --ci --all
9094

91-
9295
# docs (API)
9396
# generate python docs using pdoc
9497
- name: "Create documentation for Python components (API docs)"

Dockerfile.skema-py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ ENV PATH="/root/.cargo/bin:${PATH}"
5454
RUN pip install wheel
5555
RUN pip install six
5656
# Download ML model (~150MB)
57-
RUN curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
57+
RUN pip install huggingface_hub && python scripts/retrieve_model_ci.py
5858
RUN tree /app
5959
#RUN pip install ".[all]"
6060
# exclude dependencies for docs

docs/dev/env.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ conda activate skema
99
# Install tree-sitter parsers
1010
python skema/program_analysis/tree_sitter_parsers/build_parsers.py --all
1111
# download the checkpoint for the img2mml service
12-
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
12+
python scripts/retrieve_model.py
1313
# mathjax deps for img2mml
1414
(cd skema/img2mml/data_generation && npm install)
1515
```

pyproject.toml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,19 @@ dependencies=[
1515
"numpy",
1616
"dill==0.3.7",
1717
"networkx==2.8.8",
18-
"PyYAML",
18+
"PyYAML==6.*",
1919
"tree-sitter==0.20.4",
2020
"neo4j==5.14.1",
2121
"requests",
22-
"beautifulsoup4", # used to remove comments etc from pMML before sending to MORAE
22+
"beautifulsoup4==4.12.*", # used to remove comments etc from pMML before sending to MORAE
2323
"typing_extensions", # see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
2424
"fastapi~=0.100.0",
2525
"starlette",
2626
"httpx",
2727
"pydantic>=2.0.0",
2828
"uvicorn",
2929
"python-multipart",
30-
"func_timeout"
30+
"func_timeout==4.3.5"
3131
]
3232
# The Python program analysis pipeline does not currently work with Python 3.9
3333
# or 3.10. This may change in the future.
@@ -57,7 +57,7 @@ isa = [
5757
]
5858

5959
# shared ML dependencies
60-
ml = ["torch==2.0.1", "torchvision==0.15.2", "beartype==0.15.0"]
60+
ml = ["torch==2.0.1", "torchvision==0.15.2", "beartype==0.15.0", "huggingface_hub"]
6161

6262
# Im2MML dependencies. The img2mml service converts equation images to MathML.
6363
# See the skema/img2mml directory.
@@ -100,6 +100,7 @@ all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotati
100100
"skema.rest" = "skema/rest"
101101
"skema.skema_py" = "skema/skema_py"
102102
"skema.utils" = "skema/utils"
103+
"skema.data" = "skema/data"
103104

104105
# re-map skema/text_reading/python to skema.text_reading
105106
#"skema.text_reading" = "skema/text_reading/python"
@@ -110,7 +111,7 @@ all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotati
110111

111112
[tool.setuptools.package-data]
112113
# needed to ensure models are included in package/discoverable
113-
"*" = ["*.json", "vocab.txt", "*.pt", "*.png", "*.html", "*.yml", "*.yaml"]
114+
"*" = ["*.json", "*vocab.txt", "*.pt", "*.png", "*.html", "*.yml", "*.yaml", "*.zip"]
114115

115116
[tool.setuptools.dynamic]
116117
readme = {file = ["README.md"], content-type = "text/markdown"}

scripts/retrieve_model_ci.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import os
2+
from pathlib import Path
3+
4+
from huggingface_hub import hf_hub_download
5+
6+
def retrieve_model(model_path=None) -> str:
    """
    Return the path to the img2mml checkpoint, downloading it if necessary.

    Args:
        model_path (str or os.PathLike, optional): Path to an existing img2mml
            model file. When it is None or does not exist on disk, the default
            checkpoint location inside the repository is used instead.
            Defaults to None.

    Returns:
        str: Path to the model file (the supplied path when it exists,
            otherwise the default checkpoint location).
    """
    REPO_NAME = "lum-ai/img2mml"
    MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"

    # A caller-supplied checkpoint that is already on disk is used as-is.
    if model_path is not None and os.path.exists(model_path):
        return str(model_path)

    # This script lives in <repo>/scripts, while the checkpoint belongs in
    # <repo>/skema/img2mml/trained_models (the directory the previous curl-based
    # download step wrote to). Resolving relative to the script's own directory
    # would place the file under scripts/trained_models instead.
    # NOTE(review): confirm this matches where the img2mml service loads from.
    repo_root = Path(__file__).resolve().parents[1]
    default_path = repo_root / "skema" / "img2mml" / "trained_models" / MODEL_NAME

    # Only hit the network when the checkpoint is not already present.
    if not default_path.exists():
        print("Downloading the model checkpoint from HuggingFace...")
        # Make sure the target directory exists on a fresh checkout.
        default_path.parent.mkdir(parents=True, exist_ok=True)
        hf_hub_download(
            repo_id=REPO_NAME,
            filename=MODEL_NAME,
            local_dir=default_path.parent,
            local_dir_use_symlinks=False,
        )

    return str(default_path)
30+
31+
# Module-level call with no arguments: runs whenever this file is executed
# (or imported) and resolves/downloads the checkpoint at the default location.
retrieve_model()
21.3 KB
Binary file not shown.
Binary file not shown.
343 KB
Binary file not shown.
66.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)