From ead1eaca99345012e42186ab0de10841d8ef8797 Mon Sep 17 00:00:00 2001 From: "Mark E. Haase" Date: Tue, 6 Feb 2024 09:24:19 -0500 Subject: [PATCH] The tokenizer model is downloaded in the dockerfile to /tram/data, which is not a convenient location for developer environments. Move it to the relative path data/ml-models, which is where all the other model files are stored. --- Dockerfile | 20 ++++++++++---------- src/tram/ml/base.py | 4 +++- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 62317364ca..2fddf231aa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -64,15 +64,15 @@ RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/ # Handle custom CA certificate, if specified. RUN if test -n "${TRAM_CA_URL}" -a -n "${TRAM_CA_THUMBPRINT}" ; then \ - echo "Installing certificate authority from ${TRAM_CA_URL}" && \ - curl -sk "${TRAM_CA_URL}" -o /usr/local/share/ca-certificates/tram_ca.crt && \ - DOWNLOAD_CA_THUMBPRINT=$(openssl x509 -in /usr/local/share/ca-certificates/tram_ca.crt -fingerprint -noout | cut -d= -f2) && \ - if test "${DOWNLOAD_CA_THUMBPRINT}" = "${TRAM_CA_THUMBPRINT}"; then \ - update-ca-certificates; \ - else \ - printf "\n=====\nERROR\nExpected thumbprint: %s\nActual thumbprint: %s\n=====\n" "${TRAM_CA_THUMBPRINT}" "${DOWNLOAD_CA_THUMBPRINT}"; \ - exit 1; \ - fi; \ + echo "Installing certificate authority from ${TRAM_CA_URL}" && \ + curl -sk "${TRAM_CA_URL}" -o /usr/local/share/ca-certificates/tram_ca.crt && \ + DOWNLOAD_CA_THUMBPRINT=$(openssl x509 -in /usr/local/share/ca-certificates/tram_ca.crt -fingerprint -noout | cut -d= -f2) && \ + if test "${DOWNLOAD_CA_THUMBPRINT}" = "${TRAM_CA_THUMBPRINT}"; then \ + update-ca-certificates; \ + else \ + printf "\n=====\nERROR\nExpected thumbprint: %s\nActual thumbprint: %s\n=====\n" "${TRAM_CA_THUMBPRINT}" "${DOWNLOAD_CA_THUMBPRINT}"; \ + exit 1; \ + fi; \ fi ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt @@ -110,7 +110,7 @@ RUN --mount=type=cache,target=/root/.cache \ curl -kJL -o ${bert_data_dir}/${bert_config_localfile} $bert_config_url # run this command without cache volume mounted, so model is stored on image -RUN python3 -c "import os; import transformers; os.environ['CURL_CA_BUNDLE'] = ''; mdl = transformers.AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased'); mdl.save_pretrained('/tram/data/priv-allenai-scibert-scivocab-uncased')" +RUN python3 -c "import os; import transformers; os.environ['CURL_CA_BUNDLE'] = ''; mdl = transformers.AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased'); mdl.save_pretrained('/tram/data/ml-models/priv-allenai-scibert-scivocab-uncased')" # Generate and Run Django migrations scripts, collectstatic app files RUN tram makemigrations tram && \ diff --git a/src/tram/ml/base.py b/src/tram/ml/base.py index 3d5ddf6d9b..876cbe0f18 100644 --- a/src/tram/ml/base.py +++ b/src/tram/ml/base.py @@ -369,7 +369,9 @@ def predict_samples(self, samples: list[str]): to that technique. The sum of each row will always be 1. """ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - tokenizer = AutoTokenizer.from_pretrained("/tram/data/priv-allenai-scibert-scivocab-uncased") + tokenizer = AutoTokenizer.from_pretrained( + settings.ML_MODEL_DIR + "/priv-allenai-scibert-scivocab-uncased" + ) bert = ( BertForSequenceClassification.from_pretrained( settings.ML_MODEL_DIR + "/bert_model"