Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CU-86948uv4g docstring signature consistency #413

Merged
merged 21 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
575e206
CU-86948uv4g: Add pydoctest to dev requirements
mart-r Apr 9, 2024
fd1e2c8
CU-86948uv4g: Run pydoctest during main workflow
mart-r Apr 9, 2024
33cbdaf
CU-86948uv4g: Fix docstrings for CDB (make sure signatures match doc …
mart-r Apr 9, 2024
7662a96
CU-86948uv4g: Fix docstrings for Vocab (make sure signatures match do…
mart-r Apr 9, 2024
62eb79e
CU-86948uv4g: Fix docstrings for cdb_maker (make sure signatures matc…
mart-r Apr 9, 2024
d6543e1
CU-86948uv4g: Fix docstrings for config (make sure signatures match d…
mart-r Apr 9, 2024
850ac82
CU-86948uv4g: Move to darglint
mart-r Apr 10, 2024
b2fce68
CU-86948uv4g: Remove unnecessary pydoctest fixes
mart-r Apr 10, 2024
14f8ef7
CU-86948uv4g: Fix docstrings for CAT (make sure signatures match doc …
mart-r Apr 10, 2024
70ed455
CU-86948uv4g: Fix docstrings for stats (make sure signatures match do…
mart-r Apr 10, 2024
87f6855
CU-86948uv4g: Fix docstrings for CDB aker (make sure signatures match…
mart-r Apr 10, 2024
6da354d
CU-86948uv4g: Fix docstrings for a few more modules (make sure signat…
mart-r Apr 10, 2024
53262a8
CU-86948uv4g: Fix docstrings for a few more modules (make sure signat…
mart-r Apr 10, 2024
5a8ae1c
CU-86948uv4g: Update flake8 config to support later versions of flake8
mart-r Apr 10, 2024
7621dea
CU-86948uv4g: Fix docstrings for a the rest of the modules (make sure…
mart-r Apr 11, 2024
f7487e3
CU-86948uv4g: Move away from darglint
mart-r Apr 11, 2024
e275d0d
CU-86948uv4g: Bump flake8 to 7.0.0 for documentation checks
mart-r Apr 11, 2024
52612f6
CU-86948uv4g: Fix typing issues
mart-r Apr 11, 2024
12530e2
Merge branch 'master' into CU-86948uv4g-docstring-signature-consitency
mart-r Apr 11, 2024
37364c8
Merge branch 'master' into CU-86948uv4g-docstring-signature-consitency
mart-r Apr 18, 2024
f672dec
CU-86948uv4g: Fix additional doc string issues from new things in master
mart-r Apr 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 28 additions & 14 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,18 +1,32 @@
[flake8]
extend-ignore =
E124, ; closing bracket does not match visual indentation
E127, ; continuation line over-indented for visual indent
E128, ; continuation line under-indented for visual indent
E221, ; multiple spaces before operator
E225, ; missing whitespace around operator
E231, ; missing whitespace after ',' and ':'
E252, ; missing whitespace around parameter equal
E261, ; at least two spaces before inline comment
E265, ; block comment should start with '# '
E272, ; multiple spaces before keyword
E303, ; too many blank lines
E501, ; line too long
W291, ; trailing whitespace
W605, ; invalid escape sequence
E124,
; closing bracket does not match visual indentation
E127,
; continuation line over-indented for visual indent
E128,
; continuation line under-indented for visual indent
E221,
; multiple spaces before operator
E225,
; missing whitespace around operator
E231,
; missing whitespace after ',' and ':'
E252,
; missing whitespace around parameter equal
E261,
; at least two spaces before inline comment
E265,
; block comment should start with '# '
E272,
; multiple spaces before keyword
E303,
; too many blank lines
E501,
; line too long
W291,
; trailing whitespace
W605,
; invalid escape sequence

per-file-ignores = __init__.py:F401
187 changes: 123 additions & 64 deletions medcat/cat.py

Large diffs are not rendered by default.

75 changes: 45 additions & 30 deletions medcat/cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,11 @@ def get_name(self, cui: str) -> str:
the longest name assigned to the concept.

Args:
cui
cui (str):
Concept ID or unique identifier in this database.

Returns:
str: The name of the concept.
"""
name = cui # In case we do not find anything it will just return the CUI

Expand All @@ -128,7 +132,7 @@ def update_cui2average_confidence(self, cui: str, new_sim: float) -> None:
(self.cui2count_train.get(cui, 0) + 1)
self.is_dirty = True

def remove_names(self, cui: str, names: Dict) -> None:
def remove_names(self, cui: str, names: Dict[str, Dict]) -> None:
"""Remove names from an existing concept - effect is this name will never again be used to link to this concept.
This will only remove the name from the linker (namely name2cuis and name2cuis2status), the name will still be present everywhere else.
Why? Because it is bothersome to remove it from everywhere, but
Expand Down Expand Up @@ -166,8 +170,10 @@ def remove_names(self, cui: str, names: Dict) -> None:

def remove_cui(self, cui: str) -> None:
"""This function takes a `CUI` as an argument and removes it from all the internal objects that reference it.

Args:
cui
cui (str):
Concept ID or unique identifier in this database.
"""
if cui in self.cui2names:
del self.cui2names[cui]
Expand Down Expand Up @@ -201,7 +207,7 @@ def remove_cui(self, cui: str) -> None:
self.name2count_train = {name: len(cuis) for name, cuis in self.name2cuis.items()}
self.is_dirty = True

def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: bool = False) -> None:
def add_names(self, cui: str, names: Dict[str, Dict], name_status: str = 'A', full_build: bool = False) -> None:
"""Adds a name to an existing concept.

Args:
Expand All @@ -212,8 +218,8 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b
Names for this concept, or the value that if found in free text can be linked to this concept.
Names is an dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}`
name_status (str):
One of `P`, `N`, `A`
full_build (bool)):
One of `P`, `N`, `A`.
full_build (bool):
If True the dictionary self.addl_info will also be populated, contains a lot of extra information
about concepts, but can be very memory consuming. This is not necessary
for normal functioning of MedCAT (Default value `False`).
Expand All @@ -228,8 +234,8 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b
@deprecated("Use `cdb._add_concept` as this will be removed in a future release.")
def add_concept(self,
cui: str,
names: Dict,
ontologies: set,
names: Dict[str, Dict],
ontologies: Set[str],
name_status: str,
type_ids: Set[str],
description: str,
Expand Down Expand Up @@ -265,8 +271,8 @@ def add_concept(self,

def _add_concept(self,
cui: str,
names: Dict,
ontologies: set,
names: Dict[str, Dict],
ontologies: Set[str],
name_status: str,
type_ids: Set[str],
description: str,
Expand Down Expand Up @@ -294,6 +300,9 @@ def _add_concept(self,
If True the dictionary self.addl_info will also be populated, contains a lot of extra information
about concepts, but can be very memory consuming. This is not necessary
for normal functioning of MedCAT (Default Value `False`).

Raises:
ValueError: If there is no name info, yet the `names` dict is not empty.
"""
# Add CUI to the required dictionaries
if cui not in self.cui2names:
Expand Down Expand Up @@ -406,7 +415,7 @@ def add_addl_info(self, name: str, data: Dict, reset_existing: bool = False) ->
Args:
name (str):
What key should be used in the `addl_info` dictionary.
data (Dict[<whatever>]):
data (Dict):
What will be added as the value for the key `name`
reset_existing (bool):
Should old data be removed if it exists
Expand All @@ -425,18 +434,19 @@ def update_context_vector(self,
cui_count: int = 0) -> None:
"""Add the vector representation of a context for this CUI.

cui (str):
The concept in question.
vectors (Dict[str, numpy.ndarray]):
Vector represenation of the context, must have the format: {'context_type': np.array(<vector>), ...}
context_type - is usually one of: ['long', 'medium', 'short']
negative (bool):
Is this negative context of positive (Default Value `False`).
lr (int):
If set it will override the base value from the config file.
cui_count (int):
The learning rate will be calculated based on the count for the provided CUI + cui_count.
Defaults to 0.
Args:
cui (str):
The concept in question.
vectors (Dict[str, np.ndarray]):
Vector representation of the context, must have the format: {'context_type': np.array(<vector>), ...}
context_type - is usually one of: ['long', 'medium', 'short']
negative (bool):
Is this a negative or positive context (Default value `False`).
lr (Optional[float]):
If set it will override the base value from the config file.
cui_count (int):
The learning rate will be calculated based on the count for the provided CUI + cui_count.
Defaults to 0.
"""
if cui not in self.cui2context_vectors:
self.cui2context_vectors[cui] = {}
Expand Down Expand Up @@ -565,6 +575,9 @@ def load(cls, path: str, json_path: Optional[str] = None, config_dict: Optional[
Path to the JSON serialized folder
config_dict:
A dictionary that will be used to overwrite existing fields in the config of this CDB

Returns:
CDB: The resulting concept database.
"""
ser = CDBSerializer(path, json_path)
cdb = ser.deserialize(CDB)
Expand All @@ -582,7 +595,7 @@ def import_training(self, cdb: "CDB", overwrite: bool = True) -> None:
IMPORTANT it will not import name maps (cui2names, name2cuis or anything else) only vectors.

Args:
cdb (medcat.cdb.CDB):
cdb (CDB):
Concept database from which to import training vectors
overwrite (bool):
If True all training data in the existing CDB will be overwritten, else
Expand Down Expand Up @@ -641,7 +654,7 @@ def populate_cui2snames(self, force: bool = True) -> None:
cui2names into cui2snames.

Args:
force (bool, optional): Whether to force the (re-)population. Defaults to True.
force (bool): Whether to force the (re-)population. Defaults to True.
"""
if not force and self.cui2snames:
return
Expand All @@ -664,8 +677,11 @@ def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None:
However, the memory optimisation can be performed again afterwards.

Args:
cuis_to_keep (List[str]):
cuis_to_keep (Union[List[str], Set[str]]):
CUIs that will be kept, the rest will be removed (not completely, look above).

Raises:
Exception: If no snames and subsetting is not possible.
"""

if not self.cui2snames:
Expand Down Expand Up @@ -756,7 +772,7 @@ def most_similar(self,
min_cnt: int = 0,
topn: int = 50,
force_build: bool = False) -> Dict:
"""Given a concept it will calculate what other concepts in this CDB have the most similar
r"""Given a concept it will calculate what other concepts in this CDB have the most similar
embedding.

Args:
Expand All @@ -776,10 +792,9 @@ def most_similar(self,
Do not use cached sim matrix (Default value False)

Returns:
results (Dict):
A dictionary with topn results like: {<cui>: {'name': <name>, 'sim': <similarity>, 'type_name': <type_name>,
Dict:
A dictionary with top results like: {<cui>: {'name': <name>, 'sim': <similarity>, 'type_name': <type_name>,
'type_id': <type_id>, 'cnt': <number of training examples the concept has seen>}, ...}

"""

if 'similarity' in self.addl_info:
Expand Down
27 changes: 15 additions & 12 deletions medcat/cdb_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import datetime
import logging
import re
from typing import Optional, List, Dict, Union
from typing import Optional, List, Dict, Union, Any

from medcat.pipe import Pipe
from medcat.cdb import CDB
Expand Down Expand Up @@ -64,7 +64,7 @@ def prepare_csvs(self,
escapechar: Optional[str] = None,
index_col: bool = False,
full_build: bool = False,
only_existing_cuis: bool = False, **kwargs) -> CDB:
only_existing_cuis: bool = False, **kwargs: Any) -> CDB:
r"""Compile one or multiple CSVs into a CDB.

Note: This class/method generally uses the same instance of the CDB.
Expand All @@ -76,30 +76,33 @@ def prepare_csvs(self,
Args:
csv_paths (Union[pd.DataFrame, List[str]]):
An array of paths to the csv files that should be processed. Can also be an array of pd.DataFrames
full_build (bool):
If False only the core portions of the CDB will be built (the ones required for
the functioning of MedCAT). If True, everything will be added to the CDB - this
usually includes concept descriptions, various forms of names etc (take care that
this option produces a much larger CDB) (Default value False).
sep (str):
If necessary a custom separator for the csv files (Default value ',').
encoding (str):
encoding (Optional[str]):
Encoding to be used for reading the CSV file (Default value `None`).
escapechar (str):
escapechar (Optional[str]):
Escape char for the CSV (Default value None).
index_col (bool):
Index column for pandas read_csv (Default value False).
only_existing_cuis bool):
full_build (bool):
If False only the core portions of the CDB will be built (the ones required for
the functioning of MedCAT). If True, everything will be added to the CDB - this
usually includes concept descriptions, various forms of names etc (take care that
this option produces a much larger CDB) (Default value False).
only_existing_cuis (bool):
If True no new CUIs will be added, but only linked names will be extended. Mainly used when
enriching names of a CDB (e.g. SNOMED with UMLS terms) (Default value `False`).
Returns:
medcat.cdb.CDB: CDB with the new concepts added.
kwargs (Any):
Will be passed to pandas for CSV reading

Note:
\*\*kwargs:
Will be passed to pandas for CSV reading
csv:
Examples of the CSV used to make the CDB can be found on [GitHub](link)

Returns:
CDB: CDB with the new concepts added.
"""

useful_columns = ['cui', 'name', 'ontologies', 'name_status', 'type_ids', 'description']
Expand Down
7 changes: 5 additions & 2 deletions medcat/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,10 @@ def parse_config_file(self, path: str, extractor: ValueExtractor = _DEFAULT_EXTR

Args:
path(str): the path to the config file
extractor(ValueExtractor, optional): (Default value = _DEFAULT_EXTRACTOR)
extractor(ValueExtractor): (Default value = _DEFAULT_EXTRACTOR)

Raises:
ValueError: In case of unknown attribute.
"""
with open(path, 'r') as f:
for line in f:
Expand Down Expand Up @@ -233,7 +236,7 @@ def fields(self) -> Dict[str, ModelField]:
"""Get the fields associated with this config.

Returns:
Dict[str, Field]: The dictionary of the field names and fields
Dict[str, ModelField]: The dictionary of the field names and fields
"""
return cast(BaseModel, self).__fields__

Expand Down
4 changes: 2 additions & 2 deletions medcat/datasets/medcat_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _info(self):
)

def _split_generators(self, dl_manager): # noqa
"""Returns SplitGenerators."""
"""Returns SplitGenerators.""" # noqa
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
Expand All @@ -77,7 +77,7 @@ def _split_generators(self, dl_manager): # noqa
]

def _generate_examples(self, filepath):
"""This function returns the examples in the raw (text) form."""
"""This function returns the examples in the raw (text) form.""" # noqa
logging.info("generating examples from = %s", filepath)
with open(filepath, 'rb') as f:
docs = pickle.load(f)
Expand Down
4 changes: 2 additions & 2 deletions medcat/datasets/patient_concept_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _info(self):
)

def _split_generators(self, dl_manager): # noqa
"""Returns SplitGenerators."""
"""Returns SplitGenerators.""" # noqa
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
Expand All @@ -74,7 +74,7 @@ def _split_generators(self, dl_manager): # noqa
]

def _generate_examples(self, filepath):
"""This function returns the examples in the raw (text) form."""
"""This function returns the examples in the raw (text) form.""" # noqa
logging.info("generating examples from = %s", filepath)
with open(filepath, 'rb') as f:
pt2stream = pickle.load(f)
Expand Down
4 changes: 2 additions & 2 deletions medcat/datasets/transformers_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _info(self):
)

def _split_generators(self, dl_manager): # noqa
"""Returns SplitGenerators."""
"""Returns SplitGenerators.""" # noqa
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
Expand All @@ -73,7 +73,7 @@ def _split_generators(self, dl_manager): # noqa
),
]

def _generate_examples(self, filepaths):
def _generate_examples(self, filepaths): # noqa
cnt = 0
for filepath in filepaths:
logging.info("generating examples from = %s", filepath)
Expand Down
Loading
Loading