From 9e3624d63d4732d9c11a5d666ea57ec2e87b37a8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 22 Dec 2022 10:19:12 +0000 Subject: [PATCH 1/4] CU-862hyd5wx Unify rosalind/vocab downloading in tests, identify and fail meaningfully in case of 503 --- tests/archive_tests/test_ner_archive.py | 11 ++++---- tests/helper.py | 34 +++++++++++++++++++++++++ tests/medmentions/make_cdb.py | 12 ++++----- tests/test_ner.py | 10 ++++---- tests/test_pipe.py | 10 ++++---- 5 files changed, 55 insertions(+), 22 deletions(-) diff --git a/tests/archive_tests/test_ner_archive.py b/tests/archive_tests/test_ner_archive.py index 439413073..4e44cb368 100644 --- a/tests/archive_tests/test_ner_archive.py +++ b/tests/archive_tests/test_ner_archive.py @@ -15,6 +15,8 @@ from medcat.linking.context_based_linker import Linker from medcat.config import Config +from ..helper import VocabDownloader + class NerArchiveTests(unittest.TestCase): @@ -35,12 +37,9 @@ def setUp(self) -> None: # Check #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}} - self.vocab_path = "./tmp_vocab.dat" - if not os.path.exists(self.vocab_path): - import requests - tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") - with open(self.vocab_path, 'wb') as f: - f.write(tmp.content) + downloader = VocabDownloader() + self.vocab_path = downloader.vocab_path + downloader.check_or_download() vocab = Vocab.load(self.vocab_path) # Make the pipeline diff --git a/tests/helper.py b/tests/helper.py index 1cb284ad7..1f0615ed1 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -1,6 +1,40 @@ +import os +import requests import unittest class AsyncMock(unittest.mock.MagicMock): async def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs) + + +ERROR_503 = b""" + +503 Service Unavailable + +

Service Unavailable

+

The server is temporarily unable to service your +request due to maintenance downtime or capacity +problems. Please try again later.

+ +""" + + +class VocabDownloader: + url = 'https://medcat.rosalind.kcl.ac.uk/media/vocab.dat' + vocab_path = "./tmp_vocab.dat" + + def check_or_download(self): + if not os.path.exists(self.vocab_path): + tmp = requests.get(self.url) + if tmp.content == ERROR_503: + raise AssertionError("Rosalind server not available!") + with open(self.vocab_path, 'wb') as f: + f.write(tmp.content) + else: + with open(self.vocab_path, 'rb') as f: + content = f.read() + if content == ERROR_503: + print('ERROR 503 saved as vocab - removing and trying to redownload') + os.remove(self.vocab_path) + self.check_or_download() diff --git a/tests/medmentions/make_cdb.py b/tests/medmentions/make_cdb.py index 52929b31f..b99c78fbc 100644 --- a/tests/medmentions/make_cdb.py +++ b/tests/medmentions/make_cdb.py @@ -5,6 +5,9 @@ import logging import os +from ..helper import VocabDownloader + + config = Config() config.general['log_level'] = logging.INFO config.general['spacy_model'] = 'en_core_sci_lg' @@ -21,12 +24,9 @@ from medcat.cdb import CDB from medcat.cat import CAT -vocab_path = "./tmp_vocab.dat" -if not os.path.exists(vocab_path): - import requests - tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat") - with open(vocab_path, 'wb') as f: - f.write(tmp.content) +downloader = VocabDownloader() +vocab_path = downloader.vocab_path +downloader.check_or_download() config = Config() cdb = CDB.load("./tmp_cdb.dat", config=config) diff --git a/tests/test_ner.py b/tests/test_ner.py index 1ae6e375d..6f4d34e76 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -14,6 +14,8 @@ from medcat.config import Config from medcat.cdb import CDB +from .helper import VocabDownloader + class A_NERTests(unittest.TestCase): @classmethod @@ -25,11 +27,9 @@ def setUpClass(cls): cls.cdb = CDB(config=cls.config) print("Set up Vocab") - vocab_path = "./tmp_vocab.dat" - if not os.path.exists(vocab_path): - tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") - with 
open(vocab_path, 'wb') as f: - f.write(tmp.content) + downloader = VocabDownloader() + vocab_path = downloader.vocab_path + downloader.check_or_download() cls.vocab = Vocab.load(vocab_path) diff --git a/tests/test_pipe.py b/tests/test_pipe.py index 7f5bd2ece..583a6ecbf 100644 --- a/tests/test_pipe.py +++ b/tests/test_pipe.py @@ -17,6 +17,8 @@ from transformers import AutoTokenizer +from .helper import VocabDownloader + class PipeTests(unittest.TestCase): @@ -30,11 +32,9 @@ def setUpClass(cls) -> None: cls.config.linking['disamb_length_limit'] = 2 cls.cdb = CDB(config=cls.config) - vocab_path = "./tmp_vocab.dat" - if not os.path.exists(vocab_path): - tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") - with open(vocab_path, 'wb') as f: - f.write(tmp.content) + downloader = VocabDownloader() + vocab_path = downloader.vocab_path + downloader.check_or_download() cls.vocab = Vocab.load(vocab_path) cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab) From e9460b1a7e9fb0e0ccfb98e6e7413f0229692bc2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 22 Dec 2022 10:30:05 +0000 Subject: [PATCH 2/4] CU-862hyd5wx Remove unused imports in tests due to last commit --- tests/archive_tests/test_ner_archive.py | 1 - tests/medmentions/make_cdb.py | 1 - tests/test_ner.py | 2 -- tests/test_pipe.py | 2 -- 4 files changed, 6 deletions(-) diff --git a/tests/archive_tests/test_ner_archive.py b/tests/archive_tests/test_ner_archive.py index 4e44cb368..d41ccd0c7 100644 --- a/tests/archive_tests/test_ner_archive.py +++ b/tests/archive_tests/test_ner_archive.py @@ -1,5 +1,4 @@ import logging -import os import unittest import numpy as np from timeit import default_timer as timer diff --git a/tests/medmentions/make_cdb.py b/tests/medmentions/make_cdb.py index b99c78fbc..feb8629d2 100644 --- a/tests/medmentions/make_cdb.py +++ b/tests/medmentions/make_cdb.py @@ -3,7 +3,6 @@ from functools import partial import numpy as np import 
logging -import os from ..helper import VocabDownloader diff --git a/tests/test_ner.py b/tests/test_ner.py index 6f4d34e76..b5b185842 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -1,6 +1,4 @@ import logging -import os -import requests import unittest from spacy.lang.en import English from medcat.preprocessing.tokenizers import spacy_split_all diff --git a/tests/test_pipe.py b/tests/test_pipe.py index 583a6ecbf..e6da42898 100644 --- a/tests/test_pipe.py +++ b/tests/test_pipe.py @@ -1,7 +1,5 @@ import unittest import logging -import os -import requests from spacy.language import Language from medcat.cdb import CDB from medcat.vocab import Vocab From 6baecb691384e32d13fd5ccc8d04af18fd1ccfab Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 22 Dec 2022 11:26:15 +0000 Subject: [PATCH 3/4] CU-862hyd5wx Add possibility of generating and using a simple vocab when Rosalind is down --- tests/helper.py | 62 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/tests/helper.py b/tests/helper.py index 1f0615ed1..43b19c41c 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -2,6 +2,10 @@ import requests import unittest +import numpy as np + +from medcat.vocab import Vocab + class AsyncMock(unittest.mock.MagicMock): async def __call__(self, *args, **kwargs): @@ -19,22 +23,54 @@ async def __call__(self, *args, **kwargs): """ +SIMPLE_WORDS = """house 34444 0.3232 0.123213 1.231231 +dog 14444 0.76762 0.76767 1.45454""" + + +def generate_simple_vocab(): + v = Vocab() + # v.add_words() + for line in SIMPLE_WORDS.split('\n'): + parts = line.split("\t") + word = parts[0] + cnt = int(parts[1].strip()) + vec = None + if len(parts) == 3: + vec = np.array([float(x) for x in parts[2].strip().split(" ")]) + + v.add_word(word, cnt, vec, replace=True) + v.make_unigram_table() + return v + class VocabDownloader: url = 'https://medcat.rosalind.kcl.ac.uk/media/vocab.dat' vocab_path = "./tmp_vocab.dat" + _has_simple = False + 
+ def is_valid(self): + with open(self.vocab_path, 'rb') as f: + content = f.read() + if content == ERROR_503: + return False + v = Vocab.load(self.vocab_path) + if len(v.vocab) == 2: # simple one + self._has_simple = True + return False + return True def check_or_download(self): - if not os.path.exists(self.vocab_path): - tmp = requests.get(self.url) - if tmp.content == ERROR_503: - raise AssertionError("Rosalind server not available!") - with open(self.vocab_path, 'wb') as f: - f.write(tmp.content) - else: - with open(self.vocab_path, 'rb') as f: - content = f.read() - if content == ERROR_503: - print('ERROR 503 saved as vocab - removing and trying to redownload') - os.remove(self.vocab_path) - self.check_or_download() + if os.path.exists(self.vocab_path) and self.is_valid(): + return + tmp = requests.get(self.url) + if tmp.content == ERROR_503: + print('Rosalind server unavailable') + if self._has_simple: + print('Local simple vocab already presetn') + return + print('Generating local simple vocab instead') + v = generate_simple_vocab() + v.save(self.vocab_path) + return + with open(self.vocab_path, 'wb') as f: + f.write(tmp.content) From 9582bc7670a4d4afbb70515f2e738f9bcd7fea72 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 22 Dec 2022 11:31:04 +0000 Subject: [PATCH 4/4] CU-862hyd5wx Fix small typo in tests --- tests/helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/helper.py b/tests/helper.py index 43b19c41c..483a6d1ad 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -66,7 +66,7 @@ def check_or_download(self): if tmp.content == ERROR_503: print('Rosalind server unavailable') if self._has_simple: - print('Local simple vocab already presetn') + print('Local simple vocab already present') return print('Generating local simple vocab instead') v = generate_simple_vocab()