Release 1.2.2 (#160)
* Update README.md

* Switching the base model to spacy instead of scispacy

* Added the spacy model to requirements

* Removing scispacy dependencies

* Updated tests to use spacy instead of scispacy

* Fix version in readme

* Text fix for spacy models

* One more model rename in tests

* Fix pipeline for creating vocab with gensim 4.1.2

* Upgrade for repair_cdb

* Multiprocessing should not disable torch threading if separate_nn_components

* Added option for entity grouping

* Some shady things with print stats and train supervised

* Fix bug in linker re: context_ignore_center_tokens

* Rename TUIs to type_ids

* Fix for extra_cui_filters in supervised training

* comments

* Fix for arguments retaining state in functions

* Add linting to the GH Actions workflow (#158)

* CAT-77 add linting as part of CI

* CAT-77 make linting mandatory

* CAT-77 fix linting issues

* dependency version update

Co-authored-by: Zeljko <w.kraljevic@gmail.com>
Co-authored-by: Sander Tan <s.c.tan-3@umcutrecht.nl>
3 people authored Oct 25, 2021
1 parent dc1760a commit ed2ec93
Showing 55 changed files with 513 additions and 490 deletions.
17 changes: 17 additions & 0 deletions .flake8
@@ -0,0 +1,17 @@
[flake8]
extend-ignore =
E124, ; closing bracket does not match visual indentation
E127, ; continuation line over-indented for visual indent
E128, ; continuation line under-indented for visual indent
E221, ; multiple spaces before operator
E225, ; missing whitespace around operator
E231, ; missing whitespace after ',' and ':'
E252, ; missing whitespace around parameter equal
E261, ; at least two spaces before inline comment
E265, ; block comment should start with '# '
E272, ; multiple spaces before keyword
E303, ; too many blank lines
E501, ; line too long
W291, ; trailing whitespace
W605, ; invalid escape sequence
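
For illustration (not part of the commit), here is a small snippet that trips several of the codes ignored above — E221, W605 and E501 — and therefore still passes the `flake8 medcat` lint step under this configuration:

```python
import re

# E221: multiple spaces before operator (ignored by this config)
name    = "medcat"
version = "1.2.2"

# W605: invalid escape sequence in a non-raw string (ignored by this config)
pattern = re.compile("(\s)\(test\)")

# E501: line too long (ignored by this config) -- this comment deliberately runs well past the 79-character default limit.
print(name, version, pattern.pattern)
```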

9 changes: 8 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -25,13 +25,20 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt
- name: Lint
run: |
flake8 medcat
- name: Test
run: |
python -m unittest discover
publish-to-test-pypi:

if: github.repository == 'CogStack/MedCAT' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags') != true
if: |
github.repository == 'CogStack/MedCAT' &&
github.ref == 'refs/heads/master' &&
github.event_name == 'push' &&
startsWith(github.ref, 'refs/tags') != true
runs-on: ubuntu-18.04
needs: [build]

10 changes: 4 additions & 6 deletions README.md
@@ -34,7 +34,7 @@ A guide on how to use MedCAT is available in the [tutorial](https://github.com/C
- For macOS/linux: `pip install --upgrade medcat`
- For Windows (see [PyTorch documentation](https://pytorch.org/get-started/previous-versions/)): `pip install --upgrade medcat -f https://download.pytorch.org/whl/torch_stable.html`

2. Quickstart (v1.5+):
2. Quickstart (MedCAT v1.2+):
```python
from medcat.cat import CAT

@@ -55,12 +55,10 @@ cat.create_model_pack(<save path>)


3. Quick start with separate models:
First download scispacy models
New models (MedCAT v1.2+) need the spacy `en_core_web_md` model, while older ones use the scispacy models; install the one you need, or all of them if unsure. If you are using model packs, you do not need to download these models:
```
python -m spacy download en_core_web_md
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz
```
or
```
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz
```
```python
@@ -103,7 +101,7 @@ A basic trained model is made public. It contains ~ 35K concepts available in `M

- Vocabulary [Download](https://medcat.rosalind.kcl.ac.uk/media/vocab.dat) - Built from MedMentions

- CDB [Download](https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1_5.dat) - Built from MedMentions
- CDB [Download](https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1_2.dat) - Built from MedMentions

- MetaCAT Status [Download](https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip) - Built from a sample from MIMIC-III, detects whether an annotation is Affirmed (Positive) or Other (Negated or Hypothetical)

3 changes: 2 additions & 1 deletion medcat/__init__.py
@@ -2,7 +2,8 @@

# Hacky patch to the built-in copy module because otherwise thinc.config.Config.copy will fail on Python <= 3.6.
# (fixed in python 3.7 https://docs.python.org/3/whatsnew/3.7.html#re)
import sys
import sys # noqa

if sys.version_info.major == 3 and sys.version_info.minor <= 6:
import copy
import re
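
For context (an illustration, not part of the diff): Python 3.7 changed `copy.copy()` and `copy.deepcopy()` on compiled regex patterns to return the pattern itself, while Python <= 3.6 raised a TypeError — which is what broke `thinc.config.Config.copy` and what this patch works around. A minimal sketch of the behaviour being relied upon:

```python
import copy
import re

pattern = re.compile(r"\d+")

# On Python >= 3.7 copying a compiled pattern is a no-op returning the same
# object; on <= 3.6 it raised TypeError, hence the patch above.
assert copy.copy(pattern) is pattern
assert copy.deepcopy(pattern) is pattern
```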
193 changes: 102 additions & 91 deletions medcat/cat.py

Large diffs are not rendered by default.

61 changes: 24 additions & 37 deletions medcat/cdb.py
@@ -4,10 +4,10 @@
import dill
import logging
import numpy as np
from typing import Dict, List, Set
from typing import Dict, Set
from functools import partial

from medcat.utils.matutils import unitvec, sigmoid
from medcat.utils.matutils import unitvec
from medcat.utils.ml_utils import get_lr_linking
from medcat.config import Config, weighted_average, workers

@@ -49,6 +49,7 @@ class CDB(object):
Stores all the words that appear in this CDB and the count for each one.
"""
log = logging.getLogger(__name__)

def __init__(self, config):
self.config = config
self.name2cuis = {}
@@ -82,7 +83,6 @@ def __init__(self, config):
self.vocab = {} # Vocabulary of all words ever in our cdb
self._optim_params = None


def get_name(self, cui):
r''' Returns the preferred name if it exists, otherwise it will return
the longest name assigned to the concept.
@@ -100,12 +100,10 @@ def get_name(self, cui):

return name


def update_cui2average_confidence(self, cui, new_sim):
self.cui2average_confidence[cui] = (self.cui2average_confidence.get(cui, 0) * self.cui2count_train.get(cui, 0) + new_sim) / \
(self.cui2count_train.get(cui, 0) + 1)
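
The update above is a standard incremental (running) average: the stored mean is rescaled by the old training count, the new similarity is added, and the sum is divided by the new count. A minimal sketch with made-up numbers:

```python
def update_average(avg: float, count: int, new_sim: float) -> float:
    # Equivalent to recomputing the mean over all count + 1 observations.
    return (avg * count + new_sim) / (count + 1)

avg = 0.0
for count, sim in enumerate([0.8, 0.6, 0.7]):
    avg = update_average(avg, count, sim)
print(round(avg, 3))  # 0.7 == (0.8 + 0.6 + 0.7) / 3
```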


def remove_names(self, cui: str, names: Dict):
r''' Remove names from an existing concept - the effect is that this name will never again be used to link to this concept.
This will only remove the name from the linker (namely name2cuis and name2cuis2status); the name will still be present everywhere else.
@@ -141,7 +139,6 @@ def remove_names(self, cui: str, names: Dict):
elif self.name2cuis2status[name][_cui] == 'P':
self.name2cuis2status[name][_cui] = 'PD'


def add_names(self, cui: str, names: Dict, name_status: str='A', full_build: bool=False):
r''' Adds a name to an existing concept.
@@ -165,7 +162,6 @@

self.add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build)


def add_concept(self, cui: str, names: Dict, ontologies: set, name_status: str, type_ids: Set[str], description: str, full_build: bool=False):
r'''
Add a concept to internal Concept Database (CDB). Depending on what you are providing
@@ -192,7 +188,7 @@ def add_concept(self, cui: str, names: Dict, ontologies: set, name_status: str,
'''
# Add CUI to the required dictionaries
if cui not in self.cui2names:
# Create placeholders
self.cui2names[cui] = set()
self.cui2snames[cui] = set()

@@ -226,7 +222,7 @@ def add_concept(self, cui: str, names: Dict, ontologies: set, name_status: str,
# If CUI is not already linked do it
self.name2cuis[name].append(cui)

# At the same time it means the cui is also missing from name2cuis2status, but the
# name is there
self.name2cuis2status[name][cui] = name_status
elif name_status == 'P':
@@ -236,7 +232,7 @@ def add_concept(self, cui: str, names: Dict, ontologies: set, name_status: str,
# Means we never saw this name
self.name2cuis[name] = [cui]

# Add name2cuis2status
self.name2cuis2status[name] = {cui: name_status}


@@ -258,13 +254,17 @@ def add_concept(self, cui: str, names: Dict, ontologies: set, name_status: str,
if full_build:
# Use original_names as the base check because they must be added
if cui not in self.addl_info['cui2original_names']:
if ontologies: self.addl_info['cui2ontologies'][cui] = ontologies
if description: self.addl_info['cui2description'][cui] = description
self.addl_info['cui2original_names'][cui] = set([v['raw_name'] for k,v in names.items()])
if ontologies:
self.addl_info['cui2ontologies'][cui] = ontologies
if description:
self.addl_info['cui2description'][cui] = description
self.addl_info['cui2original_names'][cui] = set([v['raw_name'] for k, v in names.items()])
else:
# Update existing ones
if ontologies: self.addl_info['cui2ontologies'][cui].update(ontologies)
if description: self.addl_info['cui2description'][cui] = description
if ontologies:
self.addl_info['cui2ontologies'][cui].update(ontologies)
if description:
self.addl_info['cui2description'][cui] = description
self.addl_info['cui2original_names'][cui].update([v['raw_name'] for k,v in names.items()])

for type_id in type_ids:
@@ -274,7 +274,6 @@ def add_concept(self, cui: str, names: Dict, ontologies: set, name_status: str,
else:
self.addl_info['type_id2cuis'][type_id] = {cui}


def add_addl_info(self, name, data, reset_existing=False):
r''' Add data to the addl_info dictionary. This is done in a function to
not directly access the addl_info dictionary.
@@ -292,7 +291,6 @@ def add_addl_info(self, name, data, reset_existing=False):

self.addl_info[name].update(data)


def update_context_vector(self, cui, vectors, negative=False, lr=None, cui_count=0):
r''' Add the vector representation of a context for this CUI.
@@ -321,7 +319,7 @@ def update_context_vector(self, cui, vectors, negative=False, lr=None, cui_count

# Get the learning rate if None
if lr is None:
lr = get_lr_linking(self.config, self.cui2count_train[cui] + cui_count, self._optim_params, similarity)
lr = get_lr_linking(self.config, self.cui2count_train[cui] + cui_count)

if negative:
# Add negative context
@@ -332,29 +330,28 @@ def update_context_vector(self, cui, vectors, negative=False, lr=None, cui_count
self.cui2context_vectors[cui][context_type] = cv*(1-b) + vector*b

# DEBUG
self.log.debug("Updated vector embedding.\n" + \
"CUI: {}, Context Type: {}, Similarity: {:.2f}, Is Negative: {}, LR: {:.5f}, b: {:.3f}".format(cui, context_type,
self.log.debug("Updated vector embedding.\n" +
"CUI: %s, Context Type: %s, Similarity: %.2f, Is Negative: %s, LR: %.5f, b: %.3f", (cui, context_type,
similarity, negative, lr, b))
cv = self.cui2context_vectors[cui][context_type]
similarity_after = np.dot(unitvec(cv), unitvec(vector))
self.log.debug("Similarity before vs after: {:.5f} vs {:.5f}".format(similarity, similarity_after))
self.log.debug("Similarity before vs after: %.5f vs %.5f", (similarity, similarity_after))
else:
if negative:
self.cui2context_vectors[cui][context_type] = -1 * vector
else:
self.cui2context_vectors[cui][context_type] = vector

# DEBUG
self.log.debug("Added new context type with vectors.\n" + \
"CUI: {}, Context Type: {}, Is Negative: {}".format(cui, context_type, negative))
self.log.debug("Added new context type with vectors.\n" +
"CUI: %s, Context Type: %s, Is Negative: %s", (cui, context_type, negative))

if not negative:
# Increase counter only for positive examples
self.cui2count_train[cui] += 1
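
The logging changes in this hunk move from eager `str.format()` calls to lazy `%`-style interpolation, so the message is only built when the DEBUG level is actually enabled. Note that the arguments must be passed individually, not wrapped in a tuple — a tuple is treated as a single argument and breaks interpolation for messages with more than one placeholder (corrected above). A minimal sketch:

```python
import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

cui, similarity = "C0000039", 0.91234

# Lazy interpolation: the string is only formatted if DEBUG is enabled.
log.debug("CUI: %s, Similarity: %.5f", cui, similarity)

# Wrong: the tuple is one argument, but the format string expects two,
# so the logging module reports a formatting error at emit time.
# log.debug("CUI: %s, Similarity: %.5f", (cui, similarity))
```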


def save(self, path):
r''' Saves model to file (in fact it saves variables of this class).
Args:
path (`str`):
Expand All @@ -367,7 +364,6 @@ def save(self, path):
to_save['cdb'] = {k:v for k,v in self.__dict__.items() if k != 'config'}
dill.dump(to_save, f)


@classmethod
def load(cls, path, config=None):
r''' Load and return a CDB. This allows partial loads in probably not the right way at all.
@@ -393,7 +389,6 @@ def load(cls, path, config=None):

return cdb


def import_old_cdb_vectors(self, cdb):
# Import context vectors
for cui in self.cui2names: # Loop through all CUIs in the current CDB
@@ -407,7 +402,6 @@ def import_old_cdb_vectors(self, cdb):

self.cui2count_train[cui] = cdb.cui_count[cui]


def import_old_cdb(self, cdb, import_vectors=True):
r''' Import all data except for cuis and names from an old CDB.
'''
@@ -443,7 +437,6 @@ def import_old_cdb(self, cdb, import_vectors=True):
# Import cui 2 ontologies
self.addl_info['cui2ontologies'] = cdb.cui2ontos


def import_training(self, cdb, overwrite=True):
r''' This will import vector embeddings from another CDB. No new concepts will be added.
IMPORTANT: it will not import name maps (cui2names, name2cuis or anything else), only vectors.
@@ -470,7 +463,6 @@ def import_training(self, cdb, overwrite=True):
# Increase the vector count
self.cui2count_train[cui] = self.cui2count_train.get(cui, 0) + cdb.cui2count_train[cui]


def reset_cui_count(self, n=10):
r''' Reset the CUI count for all concepts that received training; used when starting new unsupervised training
or for supervised training with annealing.
@@ -485,7 +477,6 @@ def reset_cui_count(self, n=10):
for cui in self.cui2count_train.keys():
self.cui2count_train[cui] = n


def reset_training(self):
r''' Will remove all training efforts - in other words all embeddings that are learnt
for concepts in the current CDB. Please note that this does not remove synonyms (names) that were
@@ -495,7 +486,6 @@ def reset_training(self):
self.cui2context_vectors = {}
self.reset_concept_similarity()


def filter_by_cui(self, cuis_to_keep):
''' Subset the core CDB fields (dictionaries/maps). Note that this will potentially keep a few more CUIs
than are in cuis_to_keep. It will first find all names that link to the cuis_to_keep and then
@@ -566,7 +556,6 @@ def filter_by_cui(self, cuis_to_keep):
self.cui2type_ids = new_cui2type_ids
self.cui2preferred_name = new_cui2preferred_name


def print_stats(self):
r'''Print basic statistics for the CDB.
'''
@@ -577,13 +566,11 @@ def print_stats(self):
self.log.info("Average training examples per concept: {:.1f}".format(np.average(
[self.cui2count_train[cui] for cui in self.cui2count_train if self.cui2count_train[cui] > 0])))


def reset_concept_similarity(self):
r''' Reset concept similarity matrix.
'''
self.addl_info['similarity'] = {}


def most_similar(self, cui, context_type, type_id_filter=[], min_cnt=0, topn=50, force_build=False):
r''' Given a concept it will calculate what other concepts in this CDB have the most similar
embedding.
@@ -649,7 +636,7 @@ def most_similar(self, cui, context_type, type_id_filter=[], min_cnt=0, topn=50,
cnt_inds = np.arange(0, len(sim_data['sim_vectors_counts']))
if min_cnt > 0:
cnt_inds = np.where(sim_data['sim_vectors_counts'] >= min_cnt)[0]
# Intersect cnt and type_id
inds = np.intersect1d(type_id_inds, cnt_inds)

mtrx = sim_data['sim_vectors'][inds]
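
The filtering above combines two index sets — concepts passing the `type_id` filter and concepts with at least `min_cnt` training examples — via `np.intersect1d`. A minimal sketch with made-up data:

```python
import numpy as np

counts = np.array([5, 0, 12, 3])      # training counts per candidate concept
type_id_inds = np.array([0, 2, 3])    # indices passing the type_id filter
cnt_inds = np.where(counts >= 3)[0]   # indices passing min_cnt -> [0, 2, 3]

inds = np.intersect1d(type_id_inds, cnt_inds)
print(inds)  # [0 2 3] -- only these rows of the similarity matrix are kept
```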
12 changes: 5 additions & 7 deletions medcat/cdb_maker.py
@@ -1,9 +1,7 @@
import pandas
import spacy
import numpy as np
import datetime
import logging
from functools import partial
import re

from medcat.pipe import Pipe
@@ -15,6 +13,7 @@

PH_REMOVE = re.compile("(\s)\([a-zA-Z]+[^\)\(]*\)($)")


class CDBMaker(object):
r''' Given a CSV as shown in https://github.com/CogStack/MedCAT/tree/master/examples/<example> it creates a CDB or
updates an existing one.
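
The `PH_REMOVE` pattern defined above strips a trailing parenthesised qualifier from a concept name (note it is written as a plain string, producing the W605 escape-sequence warning that the new `.flake8` config ignores). A minimal illustration, not part of the diff:

```python
import re

PH_REMOVE = re.compile("(\s)\([a-zA-Z]+[^\)\(]*\)($)")

# Removes a trailing "(...)" qualifier, common in SNOMED-style names.
print(PH_REMOVE.sub("", "Headache (disorder)"))        # -> "Headache"
print(PH_REMOVE.sub("", "Pain in (the) neck region"))  # unchanged: "(the)" is not trailing
```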
@@ -30,7 +29,7 @@ class CDBMaker(object):
log = logging.getLogger(__package__)
log = add_handlers(log)

def __init__(self, config, cdb=None, name_max_words=20):
def __init__(self, config, cdb=None):
self.config = config
# Set log level
self.log.setLevel(self.config.general['log_level'])
@@ -151,7 +150,7 @@ def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index
# We can have multiple versions of a name
names = {} # {'name': {'tokens': [<str>], 'snames': [<str>]}}

raw_names = [raw_name.strip() for raw_name in row[col2ind['name']].split(self.cnf_cm['multi_separator']) if
len(raw_name.strip()) > 0]
for raw_name in raw_names:
raw_name = raw_name.strip()
@@ -166,9 +165,8 @@ def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index
self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids,
description=description, full_build=full_build)
# DEBUG
self.log.debug("\n\n**** Added\n CUI: {}\n Names: {}\n Ontologies: {}\n Name status: {}\n".format(cui, names, ontologies, name_status) + \
" Type IDs: {}\n Description: {}\n Is full build: {}".format(
type_ids, description, full_build))
self.log.debug("\n\n**** Added\n CUI: %s\n Names: %s\n Ontologies: %s\n Name status: %s\n Type IDs: %s\n Description: %s\n Is full build: %s",
cui, names, ontologies, name_status, type_ids, description, full_build)

return self.cdb
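
For illustration (not part of the diff), the name-splitting above — assuming `|` as the configured `multi_separator`; the actual value comes from the config — trims whitespace and discards empty fragments:

```python
cell = "headache|head pain| cephalalgia |"
raw_names = [n.strip() for n in cell.split("|") if len(n.strip()) > 0]
print(raw_names)  # ['headache', 'head pain', 'cephalalgia']
```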

