CogStack · mart-r · May 1, 2024 · Aug 24, 2021 · Aug 31, 2021 · Sep 1, 2021
diff --git a/.gitignore b/.gitignore
@@ -18,6 +18,9 @@ venv
 db.sqlite3
 .ipynb_checkpoints
 
+# vscode 
+.vscode
+
 #tmp and similar files
 .nfs*
 *.log

diff --git a/medcat/cat.py b/medcat/cat.py
@@ -32,6 +32,7 @@
 from medcat.linking.context_based_linker import Linker
 from medcat.preprocessing.cleaners import prepare_name
 from medcat.meta_cat import MetaCAT
+from medcat.rel_cat import RelCAT
 from medcat.utils.meta_cat.data_utils import json_to_fake_spacy
 from medcat.config import Config
 from medcat.vocab import Vocab
@@ -88,6 +89,7 @@ def __init__(self,
                  vocab: Union[Vocab, None] = None,
                  config: Optional[Config] = None,
                  meta_cats: List[MetaCAT] = [],
+                 rel_cats: List[RelCAT] = [],
                  addl_ner: Union[TransformersNER, List[TransformersNER]] = []) -> None:
         self.cdb = cdb
         self.vocab = vocab
@@ -99,6 +101,7 @@ def __init__(self,
             self.config = config
             self.cdb.config = config
         self._meta_cats = meta_cats
+        self._rel_cats = rel_cats
         self._addl_ner = addl_ner if isinstance(addl_ner, list) else [addl_ner]
         self._create_pipeline(self.config)
 
@@ -132,6 +135,9 @@ def _create_pipeline(self, config: Config):
         for meta_cat in self._meta_cats:
             self.pipe.add_meta_cat(meta_cat, meta_cat.config.general.category_name)
 
+        for rel_cat in self._rel_cats:
+            self.pipe.add_rel_cat(rel_cat, "_".join(list(rel_cat.config.general["labels2idx"].keys())))
+
         # Set max document length
         self.pipe.spacy_nlp.max_length = config.preprocessing.max_document_length
 
@@ -292,6 +298,10 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
                 name = comp[0]
                 meta_path = os.path.join(save_dir_path, "meta_" + name)
                 comp[1].save(meta_path)
+            if isinstance(comp[1], RelCAT):
+                name = comp[0]
+                rel_path = os.path.join(save_dir_path, "rel_" + name)
+                comp[1].save(rel_path)
 
         # Add a model card also, why not
         model_card_path = os.path.join(save_dir_path, "model_card.json")
@@ -336,7 +346,8 @@ def load_model_pack(cls,
                         meta_cat_config_dict: Optional[Dict] = None,
                         ner_config_dict: Optional[Dict] = None,
                         load_meta_models: bool = True,
-                        load_addl_ner: bool = True) -> "CAT":
+                        load_addl_ner: bool = True,
+                        load_rel_models: bool = True) -> "CAT":
         """Load everything within the 'model pack', i.e. the CDB, config, vocab and any MetaCAT models
         (if present)
 
@@ -359,6 +370,7 @@ def load_model_pack(cls,
         from medcat.cdb import CDB
         from medcat.vocab import Vocab
         from medcat.meta_cat import MetaCAT
+        from medcat.rel_cat import RelCAT
 
         model_pack_path = cls.attempt_unpack(zip_path)
 
@@ -401,8 +413,15 @@ def load_model_pack(cls,
             meta_cats.append(MetaCAT.load(save_dir_path=meta_path,
                                           config_dict=meta_cat_config_dict))
 
-        cat = cls(cdb=cdb, config=cdb.config, vocab=vocab, meta_cats=meta_cats, addl_ner=addl_ner)
+        # Find Rel models in model_pack
+        rel_paths = [os.path.join(model_pack_path, path) for path in os.listdir(model_pack_path) if path.startswith('rel_')] if load_rel_models else []
+        rel_cats = []
+        for rel_path in rel_paths:
+            rel_cats.append(RelCAT.load(load_path=rel_path))
+
+        cat = cls(cdb=cdb, config=cdb.config, vocab=vocab, meta_cats=meta_cats, addl_ner=addl_ner, rel_cats=rel_cats)
         logger.info(cat.get_model_card())  # Print the model card
+
         return cat
 
     def __call__(self, text: Optional[str], do_train: bool = False) -> Optional[Doc]:
@@ -1053,8 +1072,8 @@ def get_entities_multi_texts(self,
                         elif out[i].get('text', '') != text:
                             out.insert(i, self._doc_to_out(None, only_cui, addl_info))  # type: ignore
 
-                cnf_annotation_output = self.config.annotation_output
-                if not cnf_annotation_output.include_text_in_output:
+                cnf_annotation_output = getattr(self.config, 'annotation_output', {})
+                if not (cnf_annotation_output.get('include_text_in_output', False)):
                     for o in out:
                         if o is not None:
                             o.pop('text', None)

diff --git a/medcat/config_rel_cat.py b/medcat/config_rel_cat.py
@@ -0,0 +1,92 @@
+import logging
+from typing import Dict, Any, List
+from medcat.config import MixingConfig, BaseModel, Optional, Extra
+
+
+class General(MixingConfig, BaseModel):  # NOTE(review): no inner Config (extra/validate_assignment) here, unlike Model/Train/ConfigRelCAT - confirm intended
+    """The General part of the RelCAT config (device, tokenizer, context-window and label-map settings)"""
+    device: str = "cpu"
+    relation_type_filter_pairs: List = []
+    """NOTE(review): presumably pairs of types used to filter candidate relations; the pair format is not visible here - confirm"""
+    vocab_size: Optional[int] = None
+    lowercase: bool = True
+    """If true all input text will be lowercased"""
+    cntx_left: int = 15
+    """Number of tokens to take from the left of the concept"""
+    cntx_right: int = 15
+    """Number of tokens to take from the right of the concept"""
+    window_size: int = 300
+    """Max acceptable distance between entities (in characters); take care when using this, as it can produce sentences that are over 512 tokens (limit is given by tokenizer)"""
+    tokenizer_name: str = "bert"
+    model_name: str = "bert-base-uncased"
+    log_level: int = logging.INFO
+    max_seq_length: int = 512
+    tokenizer_special_tokens: bool = False
+    annotation_schema_tag_ids: List = []
+    """If a foreign non-MCAT trainer dataset is used, you can insert your own Rel entity token delimiters into the tokenizer, \
+    copy those token IDs here, and also resize your tokenizer embeddings and adjust the hidden_size of the model, this will depend on the number of tokens you introduce"""
+    labels2idx: Dict = {}  # presumably relation label -> class index; empty by default - confirm where it is filled
+    idx2labels: Dict = {}  # presumably the inverse map, class index -> relation label - confirm
+    pin_memory: bool = True
+    seed: int = 13
+    task: str = "train"
+
+
+class Model(MixingConfig, BaseModel):
+    """The model part of the RelCAT config"""
+    input_size: int = 300
+    hidden_size: int = 768
+    hidden_layers: int = 3
+    model_size: int = 5120
+    """hidden_size * 5, 5 being the number of tokens, default (s1,s2,e1,e2+CLS). NOTE(review): 5 * 768 = 3840, not 5120 (= 5 * 1024) - confirm which default is intended"""
+    dropout: float = 0.2
+    num_directions: int = 2
+    """2 - bidirectional model, 1 - unidirectional"""
+
+    padding_idx: int = -1
+    emb_grad: bool = True
+    """If True the embeddings will also be trained"""
+    ignore_cpos: bool = False
+    """If set to True center positions will be ignored when calculating representation"""
+
+    class Config:
+        extra = Extra.allow  # undeclared fields are accepted rather than rejected
+        validate_assignment = True  # values are re-validated when attributes are assigned
+
+
+class Train(MixingConfig, BaseModel):
+    """The train part of the RelCAT config"""
+    nclasses: int = 2
+    """Number of classes that this model will output"""
+    batch_size: int = 25
+    nepochs: int = 1
+    lr: float = 1e-4
+    adam_epsilon: float = 1e-4  # NOTE(review): far larger than torch.optim.Adam's default eps (1e-8), if that is where this is used - confirm intended
+    test_size: float = 0.2
+    gradient_acc_steps: int = 1
+    multistep_milestones: List[int] = [
+        2, 4, 6, 8, 12, 15, 18, 20, 22, 24, 26, 30]
+    multistep_lr_gamma: float = 0.8  # presumably the LR decay factor applied at each milestone above - confirm against the trainer
+    max_grad_norm: float = 1.0
+    shuffle_data: bool = True
+    """Used only during training, if set the dataset will be shuffled before train/test split"""
+    class_weights: Optional[Any] = None
+    score_average: str = "weighted"
+    """What to use for averaging F1/P/R across labels"""
+    auto_save_model: bool = True
+    """Should the model be saved during training for best results"""
+
+    class Config:
+        extra = Extra.allow  # undeclared fields are accepted rather than rejected
+        validate_assignment = True  # values are re-validated when attributes are assigned
+
+
+class ConfigRelCAT(MixingConfig, BaseModel):
+    """Top-level RelCAT config grouping the general, model and train sections"""
+    general: General = General()
+    model: Model = Model()
+    train: Train = Train()
+
+    class Config:
+        extra = Extra.allow  # undeclared fields are accepted rather than rejected
+        validate_assignment = True  # values are re-validated when attributes are assigned
diff --git a/medcat/pipe.py b/medcat/pipe.py
@@ -13,6 +13,7 @@
 from medcat.linking.context_based_linker import Linker
 from medcat.meta_cat import MetaCAT
 from medcat.ner.vocab_based_ner import NER
+from medcat.rel_cat import RelCAT
 from medcat.utils.normalizers import TokenNormalizer, BasicSpellChecker
 from medcat.config import Config
 from medcat.pipeline.pipe_runner import PipeRunner
@@ -161,6 +162,13 @@ def add_meta_cat(self, meta_cat: MetaCAT, name: Optional[str] = None) -> None:
         # Used for sharing pre-processed data/tokens
         Doc.set_extension('share_tokens', default=None, force=True)
 
+    def add_rel_cat(self, rel_cat: RelCAT, name: Optional[str] = None) -> None:
+        component_name = spacy.util.get_object_name(rel_cat)  # name derived from the RelCAT object itself
+        name = name if name is not None else component_name  # fall back to the derived name when no pipe name is given
+        Language.component(name=component_name, func=rel_cat)  # register the RelCAT instance as the component callable
+        self._nlp.add_pipe(component_name, name=name, last=True)  # append it at the end of the pipeline
+        # Doc-level extension for the extracted relations; the default is an empty list (force=True overwrites any earlier registration)
+        Doc.set_extension("relations", default=[], force=True)
 
     def add_addl_ner(self, addl_ner: TransformersNER, name: Optional[str] = None) -> None:
         component_name = spacy.util.get_object_name(addl_ner)
@@ -169,6 +177,7 @@ def add_addl_ner(self, addl_ner: TransformersNER, name: Optional[str] = None) ->
         self._nlp.add_pipe(component_name, name=name, last=True)
 
         Doc.set_extension('ents', default=[], force=True)
+        Doc.set_extension('relations', default=[], force=True)
         Span.set_extension('confidence', default=-1, force=True)
         Span.set_extension('id', default=0, force=True)
         Span.set_extension('cui', default=-1, force=True)