Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CU-862hyd5wx Rosalind down #287

Merged
merged 4 commits into from
Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions tests/archive_tests/test_ner_archive.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import os
import unittest
import numpy as np
from timeit import default_timer as timer
Expand All @@ -15,6 +14,8 @@
from medcat.linking.context_based_linker import Linker
from medcat.config import Config

from ..helper import VocabDownloader


class NerArchiveTests(unittest.TestCase):

Expand All @@ -35,12 +36,9 @@ def setUp(self) -> None:
# Check
#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

self.vocab_path = "./tmp_vocab.dat"
if not os.path.exists(self.vocab_path):
import requests
tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
with open(self.vocab_path, 'wb') as f:
f.write(tmp.content)
downloader = VocabDownloader()
self.vocab_path = downloader.vocab_path
downloader.check_or_download()

vocab = Vocab.load(self.vocab_path)
# Make the pipeline
Expand Down
70 changes: 70 additions & 0 deletions tests/helper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,76 @@
import os
import unittest
import unittest.mock

import numpy as np
import requests

from medcat.vocab import Vocab


class AsyncMock(unittest.mock.MagicMock):
    """A ``MagicMock`` whose call is a coroutine.

    Calling an instance returns an awaitable; awaiting it performs the
    regular ``MagicMock`` call, so ``return_value``, ``side_effect`` and the
    call-recording assertions behave as they do on a plain ``MagicMock``.

    The base class lives in the ``unittest.mock`` submodule, which is not
    loaded by a bare ``import unittest``; the module must import
    ``unittest.mock`` explicitly for this class statement to be reliable.

    NOTE(review): Python 3.8+ ships ``unittest.mock.AsyncMock``; confirm
    whether this shim is still required by the tests that use it.
    """

    async def __call__(self, *args, **kwargs):
        """Await-able call that forwards all arguments to ``MagicMock``."""
        # Delegate to the synchronous implementation so the call is recorded
        # and the configured return value / side effect is honoured.
        return super().__call__(*args, **kwargs)


# Byte-for-byte body of an HTML "503 Service Unavailable" page.
# VocabDownloader compares both the downloaded response content and the
# content of an already-saved vocab file against this value to recognise a
# failed download. The comparison is exact equality, so any change to these
# bytes (including whitespace and the trailing newline) changes what is
# detected.
ERROR_503 = b"""<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>503 Service Unavailable</title>
</head><body>
<h1>Service Unavailable</h1>
<p>The server is temporarily unable to service your
request due to maintenance downtime or capacity
problems. Please try again later.</p>
</body></html>
"""

# Fixture data for generate_simple_vocab(): one word per line, given as
# word, count, then the vector components.
# generate_simple_vocab() splits each line on tab characters and then splits
# the third field on single spaces.
# NOTE(review): confirm that the separators after the word and after the
# count are real tab characters in the file on disk.
SIMPLE_WORDS = """house 34444 0.3232 0.123213 1.231231
dog 14444 0.76762 0.76767 1.45454"""


def generate_simple_vocab():
    """Build a minimal ``Vocab`` from the ``SIMPLE_WORDS`` fixture.

    Each non-blank line of ``SIMPLE_WORDS`` is read as
    ``<word> <count> <vector components...>``. The fields may be separated
    by tabs or spaces (any run of whitespace), so the fixture still parses
    if an editor or tool converts tabs to spaces.

    Returns:
        The populated ``Vocab`` with its unigram table already built.

    Raises:
        IndexError: If a non-blank line has no count field.
        ValueError: If the count or a vector component is not numeric.
    """
    v = Vocab()
    for line in SIMPLE_WORDS.splitlines():
        # At most two splits: word, count, and the rest of the line (the
        # vector). Splitting on arbitrary whitespace accepts both tab- and
        # space-delimited fixtures.
        parts = line.split(None, 2)
        if not parts:
            # Blank line: nothing to add.
            continue
        word = parts[0]
        cnt = int(parts[1])
        vec = None
        if len(parts) == 3:
            # str.split() with no argument collapses repeated spaces, so no
            # empty component is ever passed to float().
            vec = np.array([float(x) for x in parts[2].split()])

        v.add_word(word, cnt, vec, replace=True)
    v.make_unigram_table()
    return v


class VocabDownloader:
    """Make sure a vocab file exists at ``vocab_path`` for the tests.

    The vocab is downloaded from ``url``. When the download fails (the
    server answers with the known 503 page, any other HTTP error status, or
    cannot be reached at all) a small locally generated vocab is saved
    instead so that tests can still load *something* from ``vocab_path``.
    """

    # Remote location of the vocab file.
    url = 'https://medcat.rosalind.kcl.ac.uk/media/vocab.dat'
    # Local path the vocab is stored at; callers read this attribute.
    vocab_path = "./tmp_vocab.dat"
    # Set by is_valid() when the file on disk is the generated 2-word vocab.
    _has_simple = False
    # Seconds passed to requests as the timeout, so an unresponsive server
    # cannot hang the test run.
    timeout = 30

    def is_valid(self):
        """Return True if the file at ``vocab_path`` is a full vocab.

        Returns False when the file holds the saved 503 error page, or when
        it is the generated simple vocab (exactly two words); in the latter
        case ``_has_simple`` is set so the fallback is not regenerated.
        The file must exist; callers check that first.
        """
        with open(self.vocab_path, 'rb') as f:
            content = f.read()
        if content == ERROR_503:
            return False
        v = Vocab.load(self.vocab_path)
        if len(v.vocab) == 2:  # simple one
            self._has_simple = True
            return False
        return True

    def _fetch(self):
        """Download the vocab; return its bytes, or None if unavailable."""
        try:
            response = requests.get(self.url, timeout=self.timeout)
        except requests.RequestException:
            # Server unreachable, DNS failure, timeout, ...
            return None
        # Reject HTTP error statuses as well as the known 503 body, so an
        # error page is never written to disk as if it were a vocab.
        if not response.ok or response.content == ERROR_503:
            return None
        return response.content

    def check_or_download(self):
        """Ensure ``vocab_path`` holds a loadable vocab.

        Does nothing if a valid full vocab is already present. Otherwise
        tries to download it; if that fails, keeps an existing simple vocab
        or generates and saves a new one.
        """
        if os.path.exists(self.vocab_path) and self.is_valid():
            return
        content = self._fetch()
        if content is None:
            print('Rosalind server unavailable')
            if self._has_simple:
                print('Local simple vocab already present')
                return
            print('Generating local simple vocab instead')
            v = generate_simple_vocab()
            v.save(self.vocab_path)
            return
        with open(self.vocab_path, 'wb') as f:
            f.write(content)
13 changes: 6 additions & 7 deletions tests/medmentions/make_cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from functools import partial
import numpy as np
import logging
import os

from ..helper import VocabDownloader


config = Config()
config.general['log_level'] = logging.INFO
Expand All @@ -21,12 +23,9 @@
from medcat.cdb import CDB
from medcat.cat import CAT

vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
import requests
tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
with open(vocab_path, 'wb') as f:
f.write(tmp.content)
downloader = VocabDownloader()
vocab_path = downloader.vocab_path
downloader.check_or_download()

config = Config()
cdb = CDB.load("./tmp_cdb.dat", config=config)
Expand Down
12 changes: 5 additions & 7 deletions tests/test_ner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import logging
import os
import requests
import unittest
from spacy.lang.en import English
from medcat.preprocessing.tokenizers import spacy_split_all
Expand All @@ -14,6 +12,8 @@
from medcat.config import Config
from medcat.cdb import CDB

from .helper import VocabDownloader


class A_NERTests(unittest.TestCase):
@classmethod
Expand All @@ -25,11 +25,9 @@ def setUpClass(cls):
cls.cdb = CDB(config=cls.config)

print("Set up Vocab")
vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
with open(vocab_path, 'wb') as f:
f.write(tmp.content)
downloader = VocabDownloader()
vocab_path = downloader.vocab_path
downloader.check_or_download()

cls.vocab = Vocab.load(vocab_path)

Expand Down
12 changes: 5 additions & 7 deletions tests/test_pipe.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import unittest
import logging
import os
import requests
from spacy.language import Language
from medcat.cdb import CDB
from medcat.vocab import Vocab
Expand All @@ -17,6 +15,8 @@
from transformers import AutoTokenizer


from .helper import VocabDownloader


class PipeTests(unittest.TestCase):

Expand All @@ -30,11 +30,9 @@ def setUpClass(cls) -> None:
cls.config.linking['disamb_length_limit'] = 2
cls.cdb = CDB(config=cls.config)

vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
with open(vocab_path, 'wb') as f:
f.write(tmp.content)
downloader = VocabDownloader()
vocab_path = downloader.vocab_path
downloader.check_or_download()

cls.vocab = Vocab.load(vocab_path)
cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab)
Expand Down