Relation extraction (#173)

* Added files. * More additions to rel extraction. * Rel base. * Update. * Updates. * Dependency parsing. * Updates. * Added pre-training steps. * Added training & model utils. * Cleanup & fixes. * Update. * Evaluation updates for pretraining. * Removed duplicate relation storage. * Moved RE model file location. * Structure revisions. * Added custom config for RE. * Implemented custom dataset loader for RE. * More changes. * Small fix. * Latest additions to RelCAT (pipe + predictions) * Setup.py fix. * RE utils update. * rel model update. * rel dataset + tokenizer improvements. * RelCAT updates. * RelCAT saving/loading improvements. * RelCAT saving/loading improvements. * RelCAT model fixes. * Attempted gpu learning fix. Dataset label generation fixes. * Minor train dataset gen fix. * Minor train dataset gen fix No.2. * Config updates. * Gpu support fixes. Added label stats. * Evaluation stat fixes. * Cleaned stat output mode during training. * Build fix. * removed unused dependencies and fixed code formatting * Mypy compliance. * Fixed linting. * More Gpu mode train fixes. * Fixed model saving/loading issues when using other baes models. * More fixes to stat evaluation. Added proper CAT integration of RelCAT. * Setup.py typo fix. * RelCAT loading fix. * RelCAT Config changes. * Type fix. Minor additions to RelCAT model. * Type fixes. * Type corrections. * RelCAT update. * Type fixes. * Fixed type issue. * RelCATConfig: added seed param. * Adaptations to the new codebase + type fixes.. * Doc/type fixes. * Fixed input size issue for model. * Fixed issue(s) with model size and config. * RelCAT: updated configs to new style. * RelCAT: removed old refs to logging. * Fixed GPU training + added extra stat print for train set. * Type fixes. * Updated dev requirements. * Linting. * Fixed pin_memory issue when training on CPU. * Updated RelCAT dataset get + default config. * Updated RelDS generator + default config * Linting. * Updated RelDatset + config. 
* Pushing updates to model Made changes to: 1) Extracting given number of context tokens left and right of the entities 2) Extracting hidden state from bert for all the tokens of the entities and performing max pooling on them * Fixing formatting * Update rel_dataset.py * Update rel_dataset.py * Update rel_dataset.py * RelCAT: added test resource files. * RelCAT: Fixed model load/checkpointing. * RelCAT: updated to pipe spacy doc call. * RelCAT: added tests. * Fixed lint/type issues & added rel tag to test DS. * Fixed ann id to token issue. * RelCAT: updated test dataset + tests. * RelCAT: updates to requested changes + dataset improvements. * RelCAT: updated docs/logs according to commends. * RelCAT: type fix. * RelCAT: mct export dataset updates. * RelCAT: test updates + requested changes p2. * RelCAT: log for MCT export train. * Updated docs + split train_test & dataset for benchmarks. * type fixes. --------- Co-authored-by: Shubham Agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Co-authored-by: mart-r <mart.ratas@gmail.com>
CogStack · May 1, 2024 · abc97fb · abc97fb
1 parent 1caa187
commit abc97fb
Show file tree

Hide file tree

Showing 17 changed files with 6,776 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -18,6 +18,9 @@ venv
 db.sqlite3
 .ipynb_checkpoints
 
+# vscode 
+.vscode
+
 #tmp and similar files
 .nfs*
 *.log

diff --git a/medcat/cat.py b/medcat/cat.py
@@ -33,6 +33,7 @@
 from medcat.linking.context_based_linker import Linker
 from medcat.preprocessing.cleaners import prepare_name
 from medcat.meta_cat import MetaCAT
+from medcat.rel_cat import RelCAT
 from medcat.utils.meta_cat.data_utils import json_to_fake_spacy
 from medcat.config import Config
 from medcat.vocab import Vocab
@@ -64,6 +65,8 @@ class CAT(object):
         meta_cats (list of medcat.meta_cat.MetaCAT, optional):
             A list of models that will be applied sequentially on each
             detected annotation.
+        rel_cats (list of medcat.rel_cat.RelCAT, optional):
+            List of models applied sequentially on all detected annotations.
 
     Attributes (limited):
         cdb (medcat.cdb.CDB):
@@ -89,6 +92,7 @@ def __init__(self,
                  vocab: Union[Vocab, None] = None,
                  config: Optional[Config] = None,
                  meta_cats: List[MetaCAT] = [],
+                 rel_cats: List[RelCAT] = [],
                  addl_ner: Union[TransformersNER, List[TransformersNER]] = []) -> None:
         self.cdb = cdb
         self.vocab = vocab
@@ -100,6 +104,7 @@ def __init__(self,
             self.config = config
             self.cdb.config = config
         self._meta_cats = meta_cats
+        self._rel_cats = rel_cats
         self._addl_ner = addl_ner if isinstance(addl_ner, list) else [addl_ner]
         self._create_pipeline(self.config)
 
@@ -133,6 +138,9 @@ def _create_pipeline(self, config: Config):
         for meta_cat in self._meta_cats:
             self.pipe.add_meta_cat(meta_cat, meta_cat.config.general.category_name)
 
+        for rel_cat in self._rel_cats:
+            self.pipe.add_rel_cat(rel_cat, "_".join(list(rel_cat.config.general["labels2idx"].keys())))
+
         # Set max document length
         self.pipe.spacy_nlp.max_length = config.preprocessing.max_document_length
 
@@ -297,6 +305,10 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
                 name = comp[0]
                 meta_path = os.path.join(save_dir_path, "meta_" + name)
                 comp[1].save(meta_path)
+            if isinstance(comp[1], RelCAT):
+                name = comp[0]
+                rel_path = os.path.join(save_dir_path, "rel_" + name)
+                comp[1].save(rel_path)
 
         # Add a model card also, why not
         model_card_path = os.path.join(save_dir_path, "model_card.json")
@@ -341,7 +353,8 @@ def load_model_pack(cls,
                         meta_cat_config_dict: Optional[Dict] = None,
                         ner_config_dict: Optional[Dict] = None,
                         load_meta_models: bool = True,
-                        load_addl_ner: bool = True) -> "CAT":
+                        load_addl_ner: bool = True,
+                        load_rel_models: bool = True) -> "CAT":
         """Load everything within the 'model pack', i.e. the CDB, config, vocab and any MetaCAT models
         (if present)
 
@@ -360,13 +373,16 @@ def load_model_pack(cls,
                 Whether to load MetaCAT models if present (Default value True).
             load_addl_ner (bool):
                 Whether to load additional NER models if present (Default value True).
+            load_rel_models (bool):
+                Whether to load RelCAT models if present (Default value True).
 
         Returns:
             CAT: The resulting CAT object.
         """
         from medcat.cdb import CDB
         from medcat.vocab import Vocab
         from medcat.meta_cat import MetaCAT
+        from medcat.rel_cat import RelCAT
 
         model_pack_path = cls.attempt_unpack(zip_path)
 
@@ -409,8 +425,15 @@ def load_model_pack(cls,
             meta_cats.append(MetaCAT.load(save_dir_path=meta_path,
                                           config_dict=meta_cat_config_dict))
 
-        cat = cls(cdb=cdb, config=cdb.config, vocab=vocab, meta_cats=meta_cats, addl_ner=addl_ner)
+        # Find Rel models in model_pack
+        rel_paths = [os.path.join(model_pack_path, path) for path in os.listdir(model_pack_path) if path.startswith('rel_')] if load_rel_models else []
+        rel_cats = []
+        for rel_path in rel_paths:
+            rel_cats.append(RelCAT.load(load_path=rel_path))
+
+        cat = cls(cdb=cdb, config=cdb.config, vocab=vocab, meta_cats=meta_cats, addl_ner=addl_ner, rel_cats=rel_cats)
         logger.info(cat.get_model_card())  # Print the model card
+
         return cat
 
     def __call__(self, text: Optional[str], do_train: bool = False) -> Optional[Doc]:
@@ -1092,8 +1115,8 @@ def get_entities_multi_texts(self,
                         elif out[i].get('text', '') != text:
                             out.insert(i, self._doc_to_out(None, only_cui, addl_info))  # type: ignore
 
-                cnf_annotation_output = self.config.annotation_output
-                if not cnf_annotation_output.include_text_in_output:
+                cnf_annotation_output = getattr(self.config, 'annotation_output', {})
+                if not (cnf_annotation_output.get('include_text_in_output', False)):
                     for o in out:
                         if o is not None:
                             o.pop('text', None)

diff --git a/medcat/config_rel_cat.py b/medcat/config_rel_cat.py
@@ -0,0 +1,98 @@
+import logging
+from typing import Dict, Any, List
+from medcat.config import MixingConfig, BaseModel, Optional, Extra
+
+
+class General(MixingConfig, BaseModel):
+    """The General part of the RelCAT config"""
+    device: str = "cpu"
+    relation_type_filter_pairs: List = []
+    """Map from category values to ID, if empty it will be autocalculated during training"""
+    vocab_size: Optional[int] = None
+    lowercase: bool = True
+    """If true all input text will be lowercased"""
+    cntx_left: int = 15
+    """Number of tokens to take from the left of the concept"""
+    cntx_right: int = 15
+    """Number of tokens to take from the right of the concept"""
+    window_size: int = 300
+    """Max acceptable distance between entities (in characters); take care when using this, as it can produce sentences that are over 512 tokens (limit is given by tokenizer)"""
+
+    mct_export_max_non_rel_sample_size:int = 200
+    """Limit the number of 'Other' samples selected for training/test. This is applied per encountered medcat project, sample_size/num_projects. """
+    mct_export_create_addl_rels: bool = False
+    """When processing relations from a MedCAT export, relations labeled as 'Other' are created from all the annotations pairs available"""
+
+    tokenizer_name: str = "bert"
+    model_name: str = "bert-base-uncased"
+    log_level: int = logging.INFO
+    max_seq_length: int = 512
+    tokenizer_special_tokens: bool = False
+    annotation_schema_tag_ids: List = []
+    """If a foreign non-MCAT trainer dataset is used, you can insert your own Rel entity token delimiters into the tokenizer, \
+    copy those token IDs here, and also resize your tokenizer embeddings and adjust the hidden_size of the model, this will depend on the number of tokens you introduce"""
+    labels2idx: Dict = {}
+    idx2labels: Dict = {}
+    pin_memory: bool = True
+    seed: int = 13
+    task: str = "train"
+
+
+class Model(MixingConfig, BaseModel):
+    """The model part of the RelCAT config"""
+    input_size: int = 300
+    hidden_size: int = 768
+    hidden_layers: int = 3
+    """ hidden_size * 5, 5 being the number of tokens, default (s1,s2,e1,e2+CLS)"""
+    model_size: int = 5120
+    dropout: float = 0.2
+    num_directions: int = 2
+    """2 - bidirectional model, 1 - unidirectional"""
+
+    padding_idx: int = -1
+    emb_grad: bool = True
+    """If True the embeddings will also be trained"""
+    ignore_cpos: bool = False
+    """If set to True, center positions will be ignored when calculating the representation"""
+
+    class Config:
+        extra = Extra.allow
+        validate_assignment = True
+
+
+class Train(MixingConfig, BaseModel):
+    """The train part of the RelCAT config"""
+    nclasses: int = 2
+    """Number of classes that this model will output"""
+    batch_size: int = 25
+    nepochs: int = 1
+    lr: float = 1e-4
+    adam_epsilon: float = 1e-4
+    test_size: float = 0.2
+    gradient_acc_steps: int = 1
+    multistep_milestones: List[int] = [
+        2, 4, 6, 8, 12, 15, 18, 20, 22, 24, 26, 30]
+    multistep_lr_gamma: float = 0.8
+    max_grad_norm: float = 1.0
+    shuffle_data: bool = True
+    """Used only during training, if set the dataset will be shuffled before train/test split"""
+    class_weights: Optional[Any] = None
+    score_average: str = "weighted"
+    """What to use for averaging F1/P/R across labels"""
+    auto_save_model: bool = True
+    """Should the model be saved during training for best results"""
+
+    class Config:
+        extra = Extra.allow
+        validate_assignment = True
+
+
+class ConfigRelCAT(MixingConfig, BaseModel):
+    """The RelCAT part of the config"""
+    general: General = General()
+    model: Model = Model()
+    train: Train = Train()
+
+    class Config:
+        extra = Extra.allow
+        validate_assignment = True
diff --git a/medcat/pipe.py b/medcat/pipe.py
@@ -13,6 +13,7 @@
 from medcat.linking.context_based_linker import Linker
 from medcat.meta_cat import MetaCAT
 from medcat.ner.vocab_based_ner import NER
+from medcat.rel_cat import RelCAT
 from medcat.utils.normalizers import TokenNormalizer, BasicSpellChecker
 from medcat.config import Config
 from medcat.pipeline.pipe_runner import PipeRunner
@@ -161,6 +162,13 @@ def add_meta_cat(self, meta_cat: MetaCAT, name: Optional[str] = None) -> None:
         # Used for sharing pre-processed data/tokens
         Doc.set_extension('share_tokens', default=None, force=True)
 
+    def add_rel_cat(self, rel_cat: RelCAT, name: Optional[str] = None) -> None:
+        component_name = spacy.util.get_object_name(rel_cat)
+        name = name if name is not None else component_name
+        Language.component(name=component_name, func=rel_cat)
+        self._nlp.add_pipe(component_name, name=name, last=True)
+        # Doc-level extension that holds the extracted relations (defaults to an empty list)
+        Doc.set_extension("relations", default=[], force=True)
 
     def add_addl_ner(self, addl_ner: TransformersNER, name: Optional[str] = None) -> None:
         component_name = spacy.util.get_object_name(addl_ner)
@@ -169,6 +177,7 @@ def add_addl_ner(self, addl_ner: TransformersNER, name: Optional[str] = None) ->
         self._nlp.add_pipe(component_name, name=name, last=True)
 
         Doc.set_extension('ents', default=[], force=True)
+        Doc.set_extension('relations', default=[], force=True)
         Span.set_extension('confidence', default=-1, force=True)
         Span.set_extension('id', default=0, force=True)
         Span.set_extension('cui', default=-1, force=True)