CU-8694fwyje: Update all configs with pre-load parts documented (#473)
mart-r authored Aug 12, 2024
1 parent 76c2fa2 commit c82ad4b
Showing 3 changed files with 83 additions and 10 deletions.
11 changes: 9 additions & 2 deletions medcat/config.py
@@ -350,6 +350,9 @@ class General(MixingConfig, BaseModel):
spacy_disabled_components: list = ['ner', 'parser', 'vectors', 'textcat',
'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks',
'merge_entities', 'merge_subtokens']
"""The list of spacy components that will be disabled.
NB! For these changes to take effect, the pipe would need to be recreated."""
checkpoint: CheckPoint = CheckPoint()
usage_monitor = UsageMonitor()
"""Checkpointing config"""
@@ -412,9 +415,13 @@ class Preprocessing(MixingConfig, BaseModel):
min_len_normalize: int = 5
"""Nothing below this length will ever be normalized (input tokens or concept names), normalized means lemmatized in this case"""
stopwords: Optional[set] = None
"""If None the default set of stowords from spacy will be used. This must be a Set."""
"""If None the default set of stowords from spacy will be used. This must be a Set.
NB! For these changes to take effect, the pipe would need to be recreated."""
max_document_length: int = 1000000
"""Documents longer than this will be trimmed"""
"""Documents longer than this will be trimmed.
NB! For these changes to take effect, the pipe would need to be recreated."""

class Config:
extra = Extra.allow
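Several of the settings documented above are only applied when the spaCy pipe is built, so editing them on a live model has no effect on its own. The sketch below shows one way this is commonly handled; the model-pack path is hypothetical and the exact constructor arguments may vary between MedCAT versions.

```python
from medcat.cat import CAT

# Load an existing model pack (the path here is hypothetical).
cat = CAT.load_model_pack("medcat_model_pack.zip")

# Adjust the "pre-load" settings documented above.
cat.config.general.spacy_disabled_components = ['parser', 'vectors', 'textcat']
cat.config.preprocessing.max_document_length = 2_000_000

# Per the NB! notes, the new values only take effect once the pipe is rebuilt,
# e.g. by creating a fresh CAT from the same components and the updated config.
cat = CAT(cdb=cat.cdb, vocab=cat.vocab, config=cat.config)
```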
47 changes: 40 additions & 7 deletions medcat/config_meta_cat.py
@@ -17,10 +17,16 @@ class General(MixingConfig, BaseModel):
If set to True, the component lock is not used."""
seed: int = 13
"""The seed for random number generation.
NOTE: If used alongside RelCAT or additional NER, only one of the seeds will take effect.
NB! For these changes to take effect, the pipe would need to be recreated."""
description: str = "No description"
"""Should provide a basic description of this MetaCAT model"""
category_name: Optional[str] = None
"""What category is this meta_cat model predicting/training"""
"""What category is this meta_cat model predicting/training.
NB! For these changes to take effect, the pipe would need to be recreated."""
category_value2id: Dict = {}
"""Map from category values to ID, if empty it will be autocalculated during training"""
vocab_size: Optional[int] = None
@@ -44,6 +50,8 @@ class General(MixingConfig, BaseModel):
Choose from:
- 'bbpe': Byte Pair Encoding Tokenizer
- 'bert-tokenizer': BERT Tokenizer
NB! For these changes to take effect, the pipe would need to be recreated.
"""
save_and_reuse_tokens: bool = False
"""This is a dangerous option, if not sure ALWAYS set to False. If set, it will try to share the pre-calculated
@@ -73,12 +81,15 @@ class Model(MixingConfig, BaseModel):
Note:
When changing the model, make sure to change the tokenizer accordingly.
NB! For these changes to take effect, the pipe would need to be recreated.
"""
model_variant: str = 'bert-base-uncased'
"""
Applicable only when using BERT:
Specifies the model variant to be used.
NB! For these changes to take effect, the pipe would need to be recreated.
"""
model_freeze_layers: bool = True
"""
@@ -88,18 +99,29 @@ class Model(MixingConfig, BaseModel):
- If True: BERT layers are frozen and only the fully connected (FC) layer(s) on top are trained.
- If False: Parameter-efficient fine-tuning will be applied using Low-Rank Adaptation (LoRA).
NB! For these changes to take effect, the pipe would need to be recreated.
"""
num_layers: int = 2
"""Number of layers in the model (both LSTM and BERT)"""
"""Number of layers in the model (both LSTM and BERT)
NB! For these changes to take effect, the pipe would need to be recreated."""
input_size: int = 300
"""
Specifies the size of the embedding layer.
Applicable only for LSTM model and ignored for BERT as BERT's embedding size is predefined.
NB! For these changes to take effect, the pipe would need to be recreated.
"""
hidden_size: int = 300
"""Number of neurons in the hidden layer"""
"""Number of neurons in the hidden layer.
NB! For these changes to take effect, the pipe would need to be recreated."""
dropout: float = 0.5
"""The dropout for the model.
NB! For these changes to take effect, the pipe would need to be recreated."""
phase_number: int = 0
"""Indicates whether two phase learning is to be used for training.
@@ -119,19 +141,30 @@ class Model(MixingConfig, BaseModel):
If fc2 is True and fc3 is set to True, then the 3rd fully connected layer is used
If lr_scheduler is set to True, then the learning rate scheduler is used with the optimizer
NB! For these changes to take effect, the pipe would need to be recreated.
"""
num_directions: int = 2
"""Applicable only for LSTM:
2 - bidirectional model, 1 - unidirectional"""
2 - bidirectional model, 1 - unidirectional
NB! For these changes to take effect, the pipe would need to be recreated."""
nclasses: int = 2
"""Number of classes that this model will output"""
"""Number of classes that this model will output.
NB! For these changes to take effect, the pipe would need to be recreated."""
padding_idx: int = -1
"""The padding index.
NB! For these changes to take effect, the pipe would need to be recreated."""
emb_grad: bool = True
"""Applicable only for LSTM:
If True, the embeddings will also be trained"""
If True, the embeddings will also be trained.
NB! For these changes to take effect, the pipe would need to be recreated."""
ignore_cpos: bool = False
"""If set to True center positions will be ignored when calculating representation"""

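The Model options above follow the same pattern. Below is a hedged sketch that switches a MetaCAT config from the LSTM to the BERT variant described in the docstrings; the values are illustrative only.

```python
from medcat.config_meta_cat import ConfigMetaCAT

config = ConfigMetaCAT()
config.model.model_name = 'bert'                  # BERT instead of the LSTM referenced above
config.model.model_variant = 'bert-base-uncased'
config.model.model_freeze_layers = True           # train only the FC layer(s) on top
config.model.nclasses = 3
config.model.dropout = 0.5

# As with the General settings, an already-built MetaCAT component keeps its
# old values; it has to be recreated for these changes to apply.
```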
35 changes: 34 additions & 1 deletion medcat/config_rel_cat.py
@@ -6,6 +6,9 @@
class General(MixingConfig, BaseModel):
"""The General part of the RelCAT config"""
device: str = "cpu"
"""The device to use (CPU or GPU).
NB! For these changes to take effect, the pipe would need to be recreated."""
relation_type_filter_pairs: List = []
"""Map from category values to ID, if empty it will be autocalculated during training"""
vocab_size: Optional[int] = None
@@ -24,27 +27,57 @@ class General(MixingConfig, BaseModel):
"""When processing relations from a MedCAT export, relations labeled as 'Other' are created from all the annotations pairs available"""

tokenizer_name: str = "bert"
"""The name of the tokenizer user.
NB! For these changes to take effect, the pipe would need to be recreated."""
model_name: str = "bert-base-uncased"
"""The name of the model used.
NB! For these changes to take effect, the pipe would need to be recreated."""
log_level: int = logging.INFO
"""The log level for RelCAT.
NB! For these changes to take effect, the pipe would need to be recreated."""
max_seq_length: int = 512
"""The maximum sequence length.
NB! For these changes to take effect, the pipe would need to be recreated."""
tokenizer_special_tokens: bool = False
"""Tokenizer.
NB! For these changes to take effect, the pipe would need to be recreated."""
annotation_schema_tag_ids: List = []
"""If a foreign non-MCAT trainer dataset is used, you can insert your own Rel entity token delimiters into the tokenizer, \
copy those token IDs here, and also resize your tokenizer embeddings and adjust the hidden_size of the model, this will depend on the number of tokens you introduce"""
labels2idx: Dict = {}
idx2labels: Dict = {}
pin_memory: bool = True
seed: int = 13
"""The seed for random number generation.
NOTE: If used alongside MetaCAT or additional NER, only one of the seeds will take effect.
NB! For these changes to take effect, the pipe would need to be recreated."""
task: str = "train"
"""The task for RelCAT.
NB! For these changes to take effect, the pipe would need to be recreated."""
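A corresponding hedged sketch for the RelCAT General options documented above; whether 'cuda' is available depends on the local install, and the label maps are hypothetical.

```python
from medcat.config_rel_cat import ConfigRelCAT

config = ConfigRelCAT()
config.general.device = "cuda"                   # needs a pipe rebuild, per the NB! notes
config.general.tokenizer_name = "bert"
config.general.model_name = "bert-base-uncased"
config.general.max_seq_length = 512
config.general.labels2idx = {"Other": 0, "has_dosage": 1}          # hypothetical labels
config.general.idx2labels = {v: k for k, v in config.general.labels2idx.items()}
```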


class Model(MixingConfig, BaseModel):
"""The model part of the RelCAT config"""
input_size: int = 300
hidden_size: int = 768
"""The hidden size.
NB! For these changes to take effect, the pipe would need to be recreated."""
hidden_layers: int = 3
""" hidden_size * 5, 5 being the number of tokens, default (s1,s2,e1,e2+CLS)"""
""" hidden_size * 5, 5 being the number of tokens, default (s1,s2,e1,e2+CLS).
NB! For these changes to take effect, the pipe would need to be recreated."""
model_size: int = 5120
"""The size of the model.
NB! For these changes to take effect, the pipe would need to be recreated."""
dropout: float = 0.2
num_directions: int = 2
"""2 - bidirectional model, 1 - unidirectional"""
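And a matching sketch for the RelCAT Model section; the values simply echo the defaults shown above and are not a recommendation.

```python
from medcat.config_rel_cat import ConfigRelCAT

config = ConfigRelCAT()
config.model.hidden_size = 768
config.model.hidden_layers = 3
config.model.model_size = 5120
config.model.dropout = 0.2
config.model.num_directions = 2                  # 2 = bidirectional, 1 = unidirectional

# As elsewhere, an existing RelCAT pipe has to be recreated for these to apply.
```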
