Merge remote-tracking branch 'upstream/main'

DARPA-ASKEM · Jul 10, 2023 · 6784559 · 6784559
2 parents 37bab26 + 2a56307
commit 6784559
Show file tree

Hide file tree

Showing 63 changed files with 3,989 additions and 1,426 deletions.
diff --git a/.drone.yml b/.drone.yml
@@ -53,6 +53,24 @@ steps:
  - skema/model_assembly/**
  - data/gromet/**
 
+---
+kind: pipeline
+name: skema-py
+
+steps:
+- name: test_skema_py
+ image: python:3.8-bullseye
+ commands:
+ - apt-get update
+ - apt-get -y install build-essential graphviz libgraphviz-dev
+ - pip install ".[core,dev]"
+ - pytest --cov=skema skema/skema_py/tests
+ when:
+ paths:
+ - pyproject.toml
+ - .drone.yml
+ - skema/skema_py/**
+
 ---
 kind: pipeline
 name: skema-er
@@ -63,7 +81,7 @@ steps:
  commands:
  - apt-get update
  - apt-get -y install build-essential graphviz libgraphviz-dev
- - curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt > skema/img2mml/trained_models/cnn_xfmer_OMML-90K_best_model_RPimage.pt
+ - curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
  - pip install ".[core,dev]"
  - pytest -s --cov=skema skema/img2mml/tests
  when:

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -0,0 +1,30 @@
+name: SKEMA AWS
+on:
+ workflow_run:
+ workflows: ["SKEMA docker"]
+ branches: [main]
+ types: ["completed"]
+env:
+ SKEMA_ECS_CLUSTER_NAME_MAIN: ${{ secrets.SKEMA_ECS_CLUSTER_NAME_MAIN }}
+ SKEMA_ECS_SERVICE_NAME_MAIN: ${{ secrets.SKEMA_ECS_SERVICE_NAME_MAIN }}
+ SKEMA_AWS_REGION: ${{ secrets.SKEMA_AWS_REGION }}
+jobs:
+ deploy:
+ if: ${{ github.event.workflow_run.conclusion == 'success' }}
+ name: "deploy skema system to AWS"
+ runs-on: ubuntu-latest
+ steps:
+ # main deployment
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@v1
+ with:
+ aws-access-key-id: ${{ secrets.SKEMA_AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.SKEMA_AWS_SECRET_ACCESS_KEY }}
+ aws-region: ${{ secrets.SKEMA_AWS_REGION }}
+ - name: Deploy
+ if: github.ref == 'refs/heads/main'
+ run: |
+ # update skema-py
+ aws ecs update-service --cluster $SKEMA_ECS_CLUSTER_NAME_MAIN --service darpa-askem-main-ecs-skema-py --force-new-deployment --region $SKEMA_AWS_REGION &>/dev/null
+ # update skema-rs
+ aws ecs update-service --cluster $SKEMA_ECS_CLUSTER_NAME_MAIN --service darpa-askem-main-ecs-skema-rs --force-new-deployment --region $SKEMA_AWS_REGION &>/dev/null
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -82,6 +82,8 @@ jobs:
  push: ${{ github.event_name != 'pull_request' }}
  # references `tags` step in steps for current job
  tags: ${{ steps.tags.outputs.tags }}
+ build-args: |
+ APP_VERSION=${{github.sha}}
 
  rust:
  name: "docker image for rust components"

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -1,4 +1,4 @@
-name: Project documentation
+name: Unit tests and project documentation
 
 on:
  push:
@@ -55,7 +55,7 @@ jobs:
  working-directory: .
  run: |
  # retrieve latest model for img2mml component
- curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt > skema/img2mml/trained_models/cnn_xfmer_OMML-90K_best_model_RPimage.pt
+ curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
  pip install ".[all]"
 
  # docs (API)

diff --git a/Dockerfile.skema-py b/Dockerfile.skema-py
@@ -1,5 +1,4 @@
 # Dockerfile for the skema-py service
-
 FROM python:3.8-bullseye
 
 # ======================
@@ -23,6 +22,13 @@ apt-get install -y nodejs
 RUN apt-get clean &&\
  rm -rf /var/lib/apt/lists/*
 
+
+# =====================
+# ENV setup for app
+# =====================
+ARG APP_VERSION=unknown
+ENV APP_VERSION=$APP_VERSION
+
 # =====================
 # Setup the repository
 # =====================
@@ -38,12 +44,9 @@ ENV PATH="/root/.cargo/bin:${PATH}"
 
 # Install the skema package
 RUN pip install wheel
-RUN pip install fastapi uvicorn
 RUN pip install six
 # Download ML model (~150MB)
-# FIXME: consider publishing and retrieving this model from HF Hub
-RUN curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt > skema/img2mml/trained_models/cnn_xfmer_OMML-90K_best_model_RPimage.pt
-# FIXME: remove later
+RUN curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
 RUN tree /app
 #RUN pip install ".[all]"
 # exclude dependencies for docs

diff --git a/docs/dev/env.md b/docs/dev/env.md
@@ -8,6 +8,8 @@ conda create -n skema python=3.8 -c conda-forge rust=1.70.0 openjdk=11 sbt=1.9.0
 conda activate skema
 # fortran grammar for pa
 python skema/program_analysis/TS2CAST/build_tree_sitter_fortran.py
+# download the checkpoint for the img2mml service
+curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
 # mathjax deps for img2mml
 (cd skema/img2mml/data_generation && npm install)
 ```

diff --git a/pyproject.toml b/pyproject.toml
@@ -16,9 +16,11 @@ dependencies=[
  "PyYAML",
  "tree-sitter",
  "requests",
- "fastapi",
- "uvicorn",
- "python-multipart"
+ "beautifulsoup4", # used to remove comments etc from pMML before sending to MORAE
+ "typing_extensions==4.5.0", # see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
+ "fastapi",
+ "uvicorn",
+ "python-multipart"
 ]
 # The Python program analysis pipeline does not currently work with Python 3.9
 # or 3.10. This may change in the future.
@@ -34,7 +36,7 @@ dynamic = ["readme"]
 # Pygraphviz is often tricky to install, so we reserve it for the dev extras
 # list.
 # - six: Required by auto-generated Swagger models
-dev = ["pytest", "pytest-cov", "pytest-xdist", "black", "mypy", "coverage", "pygraphviz", "six"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "httpx", "black", "mypy", "coverage", "pygraphviz", "six"]
 
 demo = ["jupyter==1.0.0"]
 

diff --git a/skema/img2mml/README.md b/skema/img2mml/README.md
@@ -3,21 +3,32 @@
 This directory contains the code for the img2mml service, which processes images
 of equations and returns presentation MathML corresponding to those equations.
 
-The model was developed by Gaurav Sharma and Clay Morrison, and this wrapper
-service was developed by Deepsana Shahi and Adarsh Pyarelal.
+The model was developed by Gaurav Sharma, Clay Morrison and Liang Zhang, and this wrapper
+service was developed by Deepsana Shahi, Adarsh Pyarelal and Liang Zhang.
 
 The model itself is not checked into the repository, but you can get it from
 here:
-https://kraken.sista.arizona.edu/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt.
+https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
 
 Place the model file in the `trained_models` directory.
 
 The curl command below should do the trick.
 
 ```
-curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_OMML-90K_best_model_RPimage.pt > trained_models/cnn_xfmer_OMML-90K_best_model_RPimage.pt
+curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
 ```
 
+If a checkpoint is already present in the `trained_models` directory and you want to update it, rerun the curl command above; it overwrites the existing file.
+
+If the model name or path changes, make all of the following modifications so that the img2mml service and the corresponding Docker operations keep working:
+
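+1. Update the `MODEL_NAME` constant (and `MODEL_BASE_ADDRESS`, if the download host changes) in the `retrieve_model` function in the skema/img2mml/api.py file; the `model_path` variable in `get_mathml_from_bytes` is obtained from that function.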
+2. Update the path settings in the "retrieve latest model for img2mml component" section of `.github/workflows/docs.yml` (at the repository root).
+3. Adjust the curl command in the test_equation_reading section of `.drone.yml` (at the repository root) so that it downloads the new checkpoint.
+4. Update the download checkpoint path in skema/img2mml/README.md.
+
+These changes will ensure that the necessary files and paths are updated correctly.
+
 Then, run the invocation below to launch the Dockerized service:
 
 ```

diff --git a/skema/img2mml/__init__.py b/skema/img2mml/__init__.py
diff --git a/skema/img2mml/api.py b/skema/img2mml/api.py
@@ -1,30 +1,58 @@
 import json
 import os
-from pathlib import Path
 import requests
-import re
+from pathlib import Path
+import urllib.request
+from skema.rest.proxies import SKEMA_MATHJAX_ADDRESS
 from skema.img2mml.translate import convert_to_torch_tensor, render_mml
 
 
-def get_mathml_from_bytes(data: bytes):
- # convert png image to tensor
- imagetensor = convert_to_torch_tensor(data)
+def retrieve_model(model_path=None):
+ """
+ Retrieve the img2mml model from the specified path or download it if not found.
 
- # change the shape of tensor from (C_in, H, W)
- # to (1, C_in, H, w) [batch =1]
- imagetensor = imagetensor.unsqueeze(0)
+ Args:
+ model_path (str, optional): Path to the img2mml model file. Defaults to None.
+
+ Returns:
+ str: Path to the loaded model file.
+ """
+ cwd = Path(__file__).parents[0]
+ MODEL_BASE_ADDRESS = "https://artifacts.askem.lum.ai/skema/img2mml/models"
+ MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"
+
+ if model_path is None:
+ model_path = cwd / "trained_models" / MODEL_NAME
 
+ # Check if the model file already exists
+ if not os.path.exists(model_path):
+ # If the file doesn't exist, download it from the specified URL
+ url = f"{MODEL_BASE_ADDRESS}/{MODEL_NAME}"
+ print(f"Downloading the model checkpoint from {url}...")
+ urllib.request.urlretrieve(url, model_path)
+
+ return str(model_path)
+
+
+def get_mathml_from_bytes(data: bytes):
  # read config file
  cwd = Path(__file__).parents[0]
- config_path = cwd / "configs" / "ourmml_xfmer_config.json"
+ config_path = cwd / "configs" / "xfmer_mml_config.json"
  with open(config_path, "r") as cfg:
  config = json.load(cfg)
+ # convert png image to tensor
+ imagetensor = convert_to_torch_tensor(data, config)
+
+ # change the shape of tensor from (C_in, H, W)
+ # to (1, C_in, H, w) [batch =1]
+ imagetensor = imagetensor.unsqueeze(0)
+ VOCAB_NAME = "arxiv_im2mml_with_fonts_with_boldface_vocab.txt"
 
  # read vocab.txt
- with open(cwd / "vocab.txt") as f:
+ with open(cwd / "trained_models" / VOCAB_NAME) as f:
  vocab = f.readlines()
 
- model_path = cwd / "trained_models" / "cnn_xfmer_OMML-90K_best_model_RPimage.pt"
+ model_path = retrieve_model()
 
  return render_mml(config, model_path, vocab, imagetensor)
 
@@ -42,11 +70,8 @@ def get_mathml_from_latex(eqn: str) -> str:
  """Read a LaTeX equation string and convert it to presentation MathML"""
 
  # Define the webservice address from the MathJAX service
- protocol = os.environ.get('SKEMA_MATHJAX_PROTOCOL', 'http://')
- host = os.environ.get('SKEMA_MATHJAX_HOST', '127.0.0.1')
- port = str(os.environ.get('SKEMA_MATHJAX_PORT', 8031))
- webservice = protocol + host + ':' + port
- print('Connecting to ' + webservice)
+ webservice = SKEMA_MATHJAX_ADDRESS
+ print(f"Connecting to {webservice}")
 
  # Translate and save each LaTeX string using the NodeJS service for MathJax
  res = requests.post(

diff --git a/skema/img2mml/configs/xfmer_mml_config.json b/skema/img2mml/configs/xfmer_mml_config.json
@@ -0,0 +1,65 @@
+{
+ "model_type": "cnn_xfmer",
+ "//": "general params ",
+ "device": "cuda",
+ "use_single_gpu": true,
+ "gpu_id": 0,
+ "DDP": false,
+ "num_DDP_gpus": 2,
+ "DDP_gpu_ids": "0,1,2,3",
+ "DataParallel": false,
+ "DataParallel_gpu_ids": "0,1,2,3",
+ "num_cpus": 150,
+ "epochs": 200,
+ "seed": 42,
+ "load_trained_model_for_testing": false,
+ "continue_training_from_last_saved_model": false,
+ "//": "params for preprocessing",
+ "data_path": "training_data/",
+ "dataset_type": "sample_data",
+ "markup": "mml",
+ "preprocessed_image_width": 800,
+ "preprocessed_image_height": 100,
+ "padding": 8,
+ "resizing_factor": 0.5,
+ "max_input_hgt": 100,
+ "batch_size": 64,
+ "max_len": 350,
+ "vocab_freq": 5,
+ "shuffle": true,
+ "pin_memory": false,
+ "num_workers": 0,
+ "//": "optimizer params",
+ "optimizer_type": "Adam",
+ "momentum": 0.9,
+ "beta_1": 0.7,
+ "beta_2": 0.9,
+ "learning_rate": 0.00025999396352806,
+ "weight_decay": 0.0000174298222581897,
+ "use_scheduler": false,
+ "starting_lr": 0.01,
+ "step_size": 30,
+ "gamma": 0.1,
+ "dropout": 0.1,
+ "//": "encoder params",
+ "encoder_dim": 512,
+ "input_channels": 3,
+ "n_xfmer_heads": 4,
+ "n_xfmer_encoder_layers": 8,
+ "dim_feedforward_for_xfmer": 1024,
+ "//": "decoder params",
+ "embedding_dim": 256,
+ "decoder_hid_dim": 512,
+ "n_xfmer_decoder_layers": 4,
+ "//": "training/testing params",
+ "clip": 1,
+ "beam_search": false,
+ "beam_k": 5,
+ "beam_search_alpha": 0.6,
+ "min_length_bean_search_normalization": 3,
+ "early_stopping": true,
+ "early_stopping_counts": 20,
+ "garbage2pad": true,
+ "clean_state_dict": false,
+ "minimum_training_epochs": 50
+}
diff --git a/skema/img2mml/data_generation/mathjax_server.js b/skema/img2mml/data_generation/mathjax_server.js
@@ -88,6 +88,11 @@ app.get('/restart', function (req, res) {
  res.send("MathJax service restarted");
 });
 
+// healthcheck
+app.get('/healthcheck', function (req, res) {
+ res.sendStatus(200);
+});
+
 function port() {
  try {
  var port = process.env.SKEMA_MATHJAX_PORT