Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
YohannParis committed Jul 10, 2023
2 parents 37bab26 + 2a56307 commit 6784559
Show file tree
Hide file tree
Showing 63 changed files with 3,989 additions and 1,426 deletions.
20 changes: 19 additions & 1 deletion .drone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,24 @@ steps:
- skema/model_assembly/**
- data/gromet/**

---
# Drone pipeline that runs the skema_py unit tests.
kind: pipeline
name: skema-py

steps:
- name: test_skema_py
image: python:3.8-bullseye
commands:
# system packages installed before the Python dependencies
# (graphviz / libgraphviz-dev are presumably needed to build pygraphviz
# from the dev extras -- confirm against pyproject.toml)
- apt-get update
- apt-get -y install build-essential graphviz libgraphviz-dev
# install the package with its core and dev extras
- pip install ".[core,dev]"
# run the skema_py tests, measuring coverage of the skema package
- pytest --cov=skema skema/skema_py/tests
# path filter for this step: package metadata, this CI config,
# and the skema_py sources
when:
paths:
- pyproject.toml
- .drone.yml
- skema/skema_py/**


---
kind: pipeline
name: skema-er
Expand All @@ -63,7 +81,7 @@ steps:
commands:
- apt-get update
- apt-get -y install build-essential graphviz libgraphviz-dev
- curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt > skema/img2mml/trained_models/cnn_xfmer_OMML-90K_best_model_RPimage.pt
- curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
- pip install ".[core,dev]"
- pytest -s --cov=skema skema/img2mml/tests
when:
Expand Down
30 changes: 30 additions & 0 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Forces a new deployment of the SKEMA ECS services on AWS.
# Runs after the "SKEMA docker" workflow completes on the main branch, and
# only when that run concluded successfully.
name: SKEMA AWS
on:
  workflow_run:
    workflows: ["SKEMA docker"]
    branches: [main]
    types: ["completed"]
env:
  SKEMA_ECS_CLUSTER_NAME_MAIN: ${{ secrets.SKEMA_ECS_CLUSTER_NAME_MAIN }}
  # NOTE(review): not referenced by any step below; the service names are
  # hard-coded in the Deploy step. Kept for backward compatibility.
  SKEMA_ECS_SERVICE_NAME_MAIN: ${{ secrets.SKEMA_ECS_SERVICE_NAME_MAIN }}
  SKEMA_AWS_REGION: ${{ secrets.SKEMA_AWS_REGION }}
jobs:
  deploy:
    # skip the deployment when the triggering workflow did not succeed
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    name: "deploy skema system to AWS"
    runs-on: ubuntu-latest
    steps:
      # main deployment
      - name: Configure AWS credentials
        # v1 is a deprecated major version of this action; v4 accepts the
        # same three inputs used here.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.SKEMA_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.SKEMA_AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.SKEMA_AWS_REGION }}
      - name: Deploy
        # NOTE(review): for workflow_run events github.ref is presumably the
        # default branch rather than the triggering branch -- confirm this
        # guard does what is intended.
        if: github.ref == 'refs/heads/main'
        # Variables are quoted so an empty or whitespace-containing secret
        # cannot be word-split into extra CLI arguments. The output of each
        # command (stdout and stderr) is discarded.
        run: |
          # update skema-py
          aws ecs update-service \
            --cluster "$SKEMA_ECS_CLUSTER_NAME_MAIN" \
            --service darpa-askem-main-ecs-skema-py \
            --force-new-deployment \
            --region "$SKEMA_AWS_REGION" &>/dev/null
          # update skema-rs
          aws ecs update-service \
            --cluster "$SKEMA_ECS_CLUSTER_NAME_MAIN" \
            --service darpa-askem-main-ecs-skema-rs \
            --force-new-deployment \
            --region "$SKEMA_AWS_REGION" &>/dev/null
2 changes: 2 additions & 0 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
# references `tags` step in steps for current job
tags: ${{ steps.tags.outputs.tags }}
build-args: |
APP_VERSION=${{github.sha}}
rust:
name: "docker image for rust components"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Project documentation
name: Unit tests and project documentation

on:
push:
Expand Down Expand Up @@ -55,7 +55,7 @@ jobs:
working-directory: .
run: |
# retrieve latest model for img2mml component
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt > skema/img2mml/trained_models/cnn_xfmer_OMML-90K_best_model_RPimage.pt
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
pip install ".[all]"
# docs (API)
Expand Down
13 changes: 8 additions & 5 deletions Dockerfile.skema-py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# Dockerfile for the skema-py service

FROM python:3.8-bullseye

# ======================
Expand All @@ -23,6 +22,13 @@ apt-get install -y nodejs
RUN apt-get clean &&\
rm -rf /var/lib/apt/lists/*


# =====================
# ENV setup for app
# =====================
ARG APP_VERSION=unknown
ENV APP_VERSION=$APP_VERSION

# =====================
# Setup the repository
# =====================
Expand All @@ -38,12 +44,9 @@ ENV PATH="/root/.cargo/bin:${PATH}"

# Install the skema package
RUN pip install wheel
RUN pip install fastapi uvicorn
RUN pip install six
# Download ML model (~150MB)
# FIXME: consider publishing and retrieving this model from HF Hub
RUN curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt > skema/img2mml/trained_models/cnn_xfmer_OMML-90K_best_model_RPimage.pt
# FIXME: remove later
RUN curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
RUN tree /app
#RUN pip install ".[all]"
# exclude dependencies for docs
Expand Down
2 changes: 2 additions & 0 deletions docs/dev/env.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ conda create -n skema python=3.8 -c conda-forge rust=1.70.0 openjdk=11 sbt=1.9.0
conda activate skema
# fortran grammar for pa
python skema/program_analysis/TS2CAST/build_tree_sitter_fortran.py
# download the checkpoint for the img2mml service
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
# mathjax deps for img2mml
(cd skema/img2mml/data_generation && npm install)
```
Expand Down
10 changes: 6 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ dependencies=[
"PyYAML",
"tree-sitter",
"requests",
"fastapi",
"uvicorn",
"python-multipart"
"beautifulsoup4", # used to remove comments etc from pMML before sending to MORAE
"typing_extensions==4.5.0", # see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
"fastapi",
"uvicorn",
"python-multipart"
]
# The Python program analysis pipeline does not currently work with Python 3.9
# or 3.10. This may change in the future.
Expand All @@ -34,7 +36,7 @@ dynamic = ["readme"]
# Pygraphviz is often tricky to install, so we reserve it for the dev extras
# list.
# - six: Required by auto-generated Swagger models
dev = ["pytest", "pytest-cov", "pytest-xdist", "black", "mypy", "coverage", "pygraphviz", "six"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "httpx", "black", "mypy", "coverage", "pygraphviz", "six"]

demo = ["jupyter==1.0.0"]

Expand Down
19 changes: 15 additions & 4 deletions skema/img2mml/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,32 @@
This directory contains the code for the img2mml service, which processes images
of equations and returns presentation MathML corresponding to those equations.

The model was developed by Gaurav Sharma and Clay Morrison, and this wrapper
service was developed by Deepsana Shahi and Adarsh Pyarelal.
The model was developed by Gaurav Sharma, Clay Morrison and Liang Zhang, and this wrapper
service was developed by Deepsana Shahi, Adarsh Pyarelal and Liang Zhang.

The model itself is not checked into the repository, but you can get it from
here:
https://kraken.sista.arizona.edu/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt.
https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt

Place the model file in the `trained_models` directory.

The curl command below should do the trick.

```
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt > trained_models/cnn_xfmer_OMML-90K_best_model_RPimage.pt
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
```

If a checkpoint already exists in the `trained_models` directory and you want to update it, re-run the curl command above; it overwrites the existing file.

To change the model name or path, update each of the following places so that the img2mml service, the CI pipelines, and the Docker build all keep using the same checkpoint:

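1. Update the `MODEL_NAME` (and, if the download location changes, `MODEL_BASE_ADDRESS`) values in the `retrieve_model` function in the skema/img2mml/api.py file; `get_mathml_from_bytes` obtains its model path from that function.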
2. Update the path settings in the "retrieve latest model for img2mml component" section of skema/.github/workflows/docs.yml.
3. Adjust the curl command in the test_equation_reading section of skema/.drone.yml to download the checkpoint.
4. Update the download checkpoint path in skema/img2mml/README.md.

These changes will ensure that the necessary files and paths are updated correctly.

Then, run the invocation below to launch the Dockerized service:

```
Expand Down
Empty file added skema/img2mml/__init__.py
Empty file.
57 changes: 41 additions & 16 deletions skema/img2mml/api.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,58 @@
import json
import os
from pathlib import Path
import requests
import re
from pathlib import Path
import urllib.request
from skema.rest.proxies import SKEMA_MATHJAX_ADDRESS
from skema.img2mml.translate import convert_to_torch_tensor, render_mml


def get_mathml_from_bytes(data: bytes):
# convert png image to tensor
imagetensor = convert_to_torch_tensor(data)
def retrieve_model(model_path=None):
    """
    Retrieve the img2mml model from the specified path or download it if not found.

    Args:
        model_path (str, optional): Path to the img2mml model file. Defaults to
            None, in which case the checkpoint is looked up (and, if missing,
            downloaded to) the ``trained_models`` directory next to this module.

    Returns:
        str: Path to the loaded model file.
    """
    cwd = Path(__file__).parents[0]
    MODEL_BASE_ADDRESS = "https://artifacts.askem.lum.ai/skema/img2mml/models"
    MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"

    if model_path is None:
        model_path = cwd / "trained_models" / MODEL_NAME

    # Check if the model file already exists
    if not os.path.exists(model_path):
        # If the file doesn't exist, download it from the specified URL
        url = f"{MODEL_BASE_ADDRESS}/{MODEL_NAME}"
        print(f"Downloading the model checkpoint from {url}...")

        target = Path(model_path)
        # The destination directory may not exist yet (e.g. a fresh checkout
        # or a caller-supplied path); urlretrieve does not create it.
        target.parent.mkdir(parents=True, exist_ok=True)

        # Download to a sibling temporary file and move it into place only
        # once the transfer has finished, so an interrupted download never
        # leaves a truncated checkpoint that later calls would accept.
        partial = target.with_name(target.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial)
            os.replace(partial, target)
        finally:
            # Only present here if the download or the move failed.
            if partial.exists():
                partial.unlink()

    return str(model_path)


def get_mathml_from_bytes(data: bytes):
    """Run the img2mml model on the bytes of a PNG equation image.

    Loads the model configuration and vocabulary that ship with this package,
    resolves the model checkpoint through ``retrieve_model`` and passes
    everything to ``render_mml``.

    Args:
        data (bytes): Contents of the PNG image to process.

    Returns:
        The result of ``render_mml`` (presumably the presentation MathML
        string for the equation -- confirm against skema.img2mml.translate).
    """
    # read config file
    cwd = Path(__file__).parents[0]
    config_path = cwd / "configs" / "xfmer_mml_config.json"
    with open(config_path, "r") as cfg:
        config = json.load(cfg)

    # convert png image to tensor
    imagetensor = convert_to_torch_tensor(data, config)

    # change the shape of tensor from (C_in, H, W)
    # to (1, C_in, H, w) [batch =1]
    imagetensor = imagetensor.unsqueeze(0)

    # read the vocabulary file stored alongside the trained models
    VOCAB_NAME = "arxiv_im2mml_with_fonts_with_boldface_vocab.txt"
    with open(cwd / "trained_models" / VOCAB_NAME) as f:
        vocab = f.readlines()

    # resolve the checkpoint path (retrieve_model downloads it if missing)
    model_path = retrieve_model()

    return render_mml(config, model_path, vocab, imagetensor)


Expand All @@ -42,11 +70,8 @@ def get_mathml_from_latex(eqn: str) -> str:
"""Read a LaTeX equation string and convert it to presentation MathML"""

# Define the webservice address from the MathJAX service
protocol = os.environ.get('SKEMA_MATHJAX_PROTOCOL', 'http://')
host = os.environ.get('SKEMA_MATHJAX_HOST', '127.0.0.1')
port = str(os.environ.get('SKEMA_MATHJAX_PORT', 8031))
webservice = protocol + host + ':' + port
print('Connecting to ' + webservice)
webservice = SKEMA_MATHJAX_ADDRESS
print(f"Connecting to {webservice}")

# Translate and save each LaTeX string using the NodeJS service for MathJax
res = requests.post(
Expand Down
65 changes: 65 additions & 0 deletions skema/img2mml/configs/xfmer_mml_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"model_type": "cnn_xfmer",
"//": "general params ",
"device": "cuda",
"use_single_gpu": true,
"gpu_id": 0,
"DDP": false,
"num_DDP_gpus": 2,
"DDP_gpu_ids": "0,1,2,3",
"DataParallel": false,
"DataParallel_gpu_ids": "0,1,2,3",
"num_cpus": 150,
"epochs": 200,
"seed": 42,
"load_trained_model_for_testing": false,
"continue_training_from_last_saved_model": false,
"//": "params for preprocessing",
"data_path": "training_data/",
"dataset_type": "sample_data",
"markup": "mml",
"preprocessed_image_width": 800,
"preprocessed_image_height": 100,
"padding": 8,
"resizing_factor": 0.5,
"max_input_hgt": 100,
"batch_size": 64,
"max_len": 350,
"vocab_freq": 5,
"shuffle": true,
"pin_memory": false,
"num_workers": 0,
"//": "optimizer params",
"optimizer_type": "Adam",
"momentum": 0.9,
"beta_1": 0.7,
"beta_2": 0.9,
"learning_rate": 0.00025999396352806,
"weight_decay": 0.0000174298222581897,
"use_scheduler": false,
"starting_lr": 0.01,
"step_size": 30,
"gamma": 0.1,
"dropout": 0.1,
"//": "encoder params",
"encoder_dim": 512,
"input_channels": 3,
"n_xfmer_heads": 4,
"n_xfmer_encoder_layers": 8,
"dim_feedforward_for_xfmer": 1024,
"//": "decoder params",
"embedding_dim": 256,
"decoder_hid_dim": 512,
"n_xfmer_decoder_layers": 4,
"//": "training/testing params",
"clip": 1,
"beam_search": false,
"beam_k": 5,
"beam_search_alpha": 0.6,
"min_length_bean_search_normalization": 3,
"early_stopping": true,
"early_stopping_counts": 20,
"garbage2pad": true,
"clean_state_dict": false,
"minimum_training_epochs": 50
}
5 changes: 5 additions & 0 deletions skema/img2mml/data_generation/mathjax_server.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ app.get('/restart', function (req, res) {
res.send("MathJax service restarted");
});

// Healthcheck endpoint: answers GET /healthcheck with HTTP status 200.
// It does no other work, so it can be polled to verify that the server
// is up and accepting requests.
app.get('/healthcheck', function (req, res) {
res.sendStatus(200);
});

function port() {
try {
var port = process.env.SKEMA_MATHJAX_PORT
Expand Down
Loading

0 comments on commit 6784559

Please sign in to comment.