Skip to content

Commit

Permalink
[data] Refactors skema to fetch artifacts locally instead of from art…
Browse files Browse the repository at this point in the history
…ifacts.askem.lum.ai (#881)

## Summary of Changes
Refactors skema codebase to fetch data artifacts locally instead of from
artifacts.askem.lum.ai.

### Changes
- skema.data submodule created to store local artifacts
- model zip archives now stored locally under skema/data
- img2mml model is stored in a huggingface repository and is downloaded
at runtime.
-  huggingface_hub added as a dependency
- `deploy.yml` workflow removed 
- Old `.drone.yml` CI file removed
- Updated documentation to account for these changes
  • Loading branch information
vincentraymond-ua authored Apr 23, 2024
1 parent 482b768 commit 6cda598
Show file tree
Hide file tree
Showing 34 changed files with 95 additions and 179 deletions.
91 changes: 0 additions & 91 deletions .drone.yml

This file was deleted.

30 changes: 0 additions & 30 deletions .github/workflows/deploy.yml

This file was deleted.

5 changes: 4 additions & 1 deletion .github/workflows/tests-and-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ jobs:
working-directory: .
run: |
# retrieve latest model for img2mml component
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
pip install huggingface_hub
python scripts/retrieve_model_ci.py
# Install askem
pip install ".[all]"
# Install tree-sitter parser (for Python component unit tests)
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.skema-py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install wheel
RUN pip install six
# Download ML model (~150MB)
RUN curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
RUN pip install huggingface_hub && python scripts/retrieve_model_ci.py
RUN tree /app
#RUN pip install ".[all]"
# exclude dependencies for docs
Expand Down
2 changes: 1 addition & 1 deletion docs/dev/env.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ conda activate skema
# Install tree-sitter parsers
python skema/program_analysis/tree_sitter_parsers/build_parsers.py --all
# download the checkpoint for the img2mml service
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
python scripts/retrieve_model_ci.py
# mathjax deps for img2mml
(cd skema/img2mml/data_generation && npm install)
```
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ isa = [
]

# shared ML dependencies
ml = ["torch==2.0.1", "torchvision==0.15.2", "beartype==0.15.0"]
ml = ["torch==2.0.1", "torchvision==0.15.2", "beartype==0.15.0", "huggingface_hub"]

# Im2MML dependencies. The img2mml service converts equation images to MathML.
# See the skema/img2mml directory.
Expand Down Expand Up @@ -100,6 +100,7 @@ all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotati
"skema.rest" = "skema/rest"
"skema.skema_py" = "skema/skema_py"
"skema.utils" = "skema/utils"
"skema.data" = "skema/data"

# re-map skema/text_reading/python to skema.text_reading
#"skema.text_reading" = "skema/text_reading/python"
Expand All @@ -110,7 +111,7 @@ all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotati

[tool.setuptools.package-data]
# needed to ensure models are included in package/discoverable
"*" = ["*.json", "*vocab.txt", "*.pt", "*.png", "*.html", "*.yml", "*.yaml"]
"*" = ["*.json", "*vocab.txt", "*.pt", "*.png", "*.html", "*.yml", "*.yaml", "*.zip"]

[tool.setuptools.dynamic]
readme = {file = ["README.md"], content-type = "text/markdown"}
Expand Down
31 changes: 31 additions & 0 deletions scripts/retrieve_model_ci.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
from pathlib import Path

from huggingface_hub import hf_hub_download

def retrieve_model(model_path=None) -> str:
    """
    Return the path to the img2mml checkpoint, downloading it if it is absent.

    Args:
        model_path (str, optional): Path to an existing img2mml model file.
            When None, or when nothing exists at that path, the default
            location ``skema/img2mml/trained_models/<MODEL_NAME>`` (relative
            to the repository root) is used instead. Defaults to None.

    Returns:
        str: Path to the model file.
    """
    REPO_NAME = "lum-ai/img2mml"
    MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"

    # A caller-supplied checkpoint that already exists is used as-is.
    if model_path is not None and os.path.exists(model_path):
        return str(model_path)

    # This script lives in <repo>/scripts, while the checkpoint belongs in
    # <repo>/skema/img2mml/trained_models (the location the previous curl
    # command wrote to). Resolve __file__ first so the parent lookup also
    # works when the script is started through a relative path.
    repo_root = Path(__file__).resolve().parents[1]
    model_path = repo_root / "skema" / "img2mml" / "trained_models" / MODEL_NAME

    # Only hit the network when the checkpoint is not already on disk.
    if not model_path.exists():
        print("Downloading the model checkpoint from HuggingFace...")
        hf_hub_download(
            repo_id=REPO_NAME,
            filename=MODEL_NAME,
            local_dir=model_path.parent,
            local_dir_use_symlinks=False,
        )

    return str(model_path)

retrieve_model()
Binary file added skema/data/program_analysis/ABM-COVID-ABS.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-COmplexVID-19.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-Covasim.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-REINA.zip
Binary file not shown.
Binary file added skema/data/program_analysis/Bucky.zip
Binary file not shown.
Binary file added skema/data/program_analysis/CHIME-SIR-model.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Climlab.zip
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Generated-Halfar.zip
Binary file not shown.
Binary file added skema/data/program_analysis/MechBayes.zip
Binary file not shown.
Binary file added skema/data/program_analysis/SIDARTHE.zip
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Simple-SIR.zip
Binary file not shown.
Binary file added skema/data/program_analysis/TIE-GCM.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions skema/data/program_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from pathlib import Path

# Directory that holds this package's bundled model zip archives; it is the
# folder containing this __init__.py.
MODEL_ZIP_ROOT_PATH = Path(__file__).parents[0]
Binary file added skema/data/program_analysis/cism_v3.zip
Binary file not shown.
Binary file added skema/data/program_analysis/climlab-v2.zip
Binary file not shown.
Binary file added skema/data/program_analysis/code_sir.zip
Binary file not shown.
Binary file added skema/data/program_analysis/examples_python.zip
Binary file not shown.
11 changes: 5 additions & 6 deletions skema/img2mml/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,23 @@ service was developed by Deepsana Shahi, Adarsh Pyarelal and Liang Zhang.

The model itself is not checked into the repository, but you can get it from
here:
https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
https://huggingface.co/lum-ai/img2mml/blob/main/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt

Place the model file in the `trained_models` directory.

The curl command below should do the trick.
The Python command below should do the trick.

```
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
python ../../scripts/retrieve_model_ci.py
```

If you have the checkpoint in the `trained_models` directory already and hope to update it, please run the above curl command that will replace the previous one.
If you already have the checkpoint in the `trained_models` directory and want to update it, delete the existing file and then run the above Python command again; the script only downloads the checkpoint when it is not already present.

To update the model name or path, please make the following modifications to support updating the img2mml service and the corresponding Docker operations:

1. Modify the ENV variable of `SKEMA_IMG2MML_MODEL_PATH`.
2. Update the path settings in the "retrieve latest model for img2mml component" section of `skema/.github/workflows/tests-and-docs.yml`.
3. Adjust the curl command in the test_equation_reading section of `skema/.drone.yml` to download the checkpoint.
4. Update the download checkpoint path in `skema/img2mml/README.md`.
3. Update the download checkpoint path in `skema/img2mml/README.md`.

These changes will ensure that the necessary files and paths are updated correctly.

Expand Down
10 changes: 5 additions & 5 deletions skema/img2mml/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from PIL import Image
from io import BytesIO

from huggingface_hub import hf_hub_download

def retrieve_model(model_path=None) -> str:
"""
Expand All @@ -25,7 +26,7 @@ def retrieve_model(model_path=None) -> str:
str: Path to the loaded model file.
"""
cwd = Path(__file__).parents[0]
MODEL_BASE_ADDRESS = "https://artifacts.askem.lum.ai/skema/img2mml/models"
REPO_NAME = "lum-ai/img2mml"
MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"
# If the model path is none or doesn't exist, the default model will be downloaded from server.
if model_path is None or not os.path.exists(model_path):
Expand All @@ -34,10 +35,9 @@ def retrieve_model(model_path=None) -> str:
# Check if the model file already exists
if not os.path.exists(model_path):
# If the file doesn't exist, download it from the specified URL
url = f"{MODEL_BASE_ADDRESS}/{MODEL_NAME}"
print(f"Downloading the model checkpoint from {url}...")
urllib.request.urlretrieve(url, model_path)

print(f"Downloading the model checkpoint from HuggingFace...")
hf_hub_download(repo_id=REPO_NAME, filename=MODEL_NAME, local_dir=model_path.parent, local_dir_use_symlinks=False)

return str(model_path)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@
from skema.rest.utils import fn_preprocessor
from skema.rest.workflows import code_snippets_to_pn_amr
from skema.utils.fold import del_nulls, dictionary_to_gromet_json
from skema.data.program_analysis import MODEL_ZIP_ROOT_PATH
from skema.utils.change_dir_back import change_dir_back
from skema.skema_py.server import System


# Constants for file paths
THIS_PATH = Path(__file__).parent.resolve()
MODEL_YAML_PATH = THIS_PATH / "models.yaml"
Expand Down Expand Up @@ -149,11 +151,12 @@ def process_single_model(html: HTML_Instance, output_dir: str, model_name: str):
"""Generate an HTML report for a single model"""
html.add_model(model_name)

if model_name in MODEL_YAML:
model_url = MODEL_YAML[model_name]["zip_archive"]
response = requests.get(model_url)

zip = ZipFile(BytesIO(response.content))
if not model_name in MODEL_YAML:
return

model_path = MODEL_ZIP_ROOT_PATH.resolve() / MODEL_YAML[model_name]["zip_archive"]

zip = ZipFile(BytesIO(model_path.read_bytes()))
with TemporaryDirectory() as temp:
# We need to write all the files to the temporary directory before processing
# This is because some steps may require additional files, such as include directories in Fortran
Expand Down Expand Up @@ -297,7 +300,8 @@ def process_all_models(html: HTML_Instance, output_dir: str):
try:
supported, total = process_single_model(html, output_dir, model_name)
model_line_coverage[model_name] = (supported, total)
except:
except Exception as e:
print(e)
continue
return model_line_coverage

Expand Down
39 changes: 19 additions & 20 deletions skema/program_analysis/model_coverage_report/models.yaml
Original file line number Diff line number Diff line change
@@ -1,57 +1,56 @@
---
CHIME-penn-full:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-penn-full-model.zip"
zip_archive: "CHIME-penn-full-model.zip"

CHIME-SIR:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-SIR-model.zip"
zip_archive: "CHIME-SIR-model.zip"

CHIME-SVIIvR:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-SVIIvR-model.zip"
zip_archive: "CHIME-SVIIvR-model.zip"

ABM-COmplexVID-19:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/ABM-COmplexVID-19.zip"
zip_archive: "ABM-COmplexVID-19.zip"

ABM-COVID-ABS:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/ABM-COVID-ABS.zip"
zip_archive: "ABM-COVID-ABS.zip"

CHIME-penn-full:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-penn-full-model.zip"
zip_archive: "CHIME-penn-full-model.zip"

MechBayes:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/MechBayes.zip"
zip_archive: "MechBayes.zip"

SIDARTHE:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/SIDARTHE.zip"
zip_archive: "SIDARTHE.zip"

Simple-SIR:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Simple-SIR.zip"
zip_archive: "Simple-SIR.zip"

Climlab-v1:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Climlab.zip"
zip_archive: "Climlab.zip"

Climlab-v2:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/climlab-v2.zip"
zip_archive: "climlab-v2.zip"

Examples-Python:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/examples_python.zip"
zip_archive: "examples_python.zip"

Generated-Halfar:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Generated-Halfar.zip"
zip_archive: "Generated-Halfar.zip"

SV2AIR3-Waterloo-MATLAB:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/SV2AIR3-Waterloo-MATLAB.zip"
zip_archive: "SV2AIR3-Waterloo-MATLAB.zip"

Bucky:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Bucky.zip"
zip_archive: "Bucky.zip"

ABM-REINA:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/ABM-REINA.zip"
zip_archive: "ABM-REINA.zip"

Cornell-COVID19-sim-Frazier:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/Cornell-COVID19-sim-Frazier.zip"
zip_archive: "Cornell-COVID19-sim-Frazier.zip"

ABM-Covasim:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/ABM-Covasim.zip"
zip_archive: "ABM-Covasim.zip"

TIE-GCM:
zip_archive: "https://artifacts.askem.lum.ai/askem/data/models/zip-archives/TIE-GCM.zip"
zip_archive: "TIE-GCM.zip"
Loading

0 comments on commit 6cda598

Please sign in to comment.