CU-8694fwyje: Update all configs with pre-load parts documented (#473)
mart-r authored Aug 12, 2024
1 parent 76c2fa2 commit c82ad4b
Showing 3 changed files with 83 additions and 10 deletions.
11 changes: 9 additions & 2 deletions medcat/config.py
@@ -350,6 +350,9 @@ class General(MixingConfig, BaseModel):
spacy_disabled_components: list = ['ner', 'parser', 'vectors', 'textcat',
'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks',
'merge_entities', 'merge_subtokens']
"""The list of spacy components that will be disabled.
NB! For these changes to take effect, the pipe would need to be recreated."""
checkpoint: CheckPoint = CheckPoint()
usage_monitor = UsageMonitor()
"""Checkpointing config"""
@@ -412,9 +415,13 @@ class Preprocessing(MixingConfig, BaseModel):
min_len_normalize: int = 5
"""Nothing below this length will ever be normalized (input tokens or concept names), normalized means lemmatized in this case"""
stopwords: Optional[set] = None
"""If None the default set of stowords from spacy will be used. This must be a Set."""
"""If None the default set of stowords from spacy will be used. This must be a Set.
NB! For these changes to take effect, the pipe would need to be recreated."""
max_document_length: int = 1000000
"""Documents longer than this will be trimmed"""
"""Documents longer than this will be trimmed.
NB! For these changes to take effect, the pipe would need to be recreated."""

class Config:
extra = Extra.allow
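Several of the settings documented above are only applied when the spaCy pipe is built, so editing them on a live model has no effect on its own. The sketch below shows one way this is commonly handled; the model-pack path is hypothetical and the exact constructor arguments may vary between MedCAT versions.

```python
from medcat.cat import CAT

# Load an existing model pack (the path here is hypothetical).
cat = CAT.load_model_pack("medcat_model_pack.zip")

# Adjust the "pre-load" settings documented above.
cat.config.general.spacy_disabled_components = ['parser', 'vectors', 'textcat']
cat.config.preprocessing.max_document_length = 2_000_000

# Per the NB! notes, the new values only take effect once the pipe is rebuilt,
# e.g. by creating a fresh CAT from the same components and the updated config.
cat = CAT(cdb=cat.cdb, vocab=cat.vocab, config=cat.config)
```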
47 changes: 40 additions & 7 deletions medcat/config_meta_cat.py
@@ -17,10 +17,16 @@ class General(MixingConfig, BaseModel):
If set to True, the component lock is not used."""
seed: int = 13
"""The seed for random number generation.
NOTE: If used alongside RelCAT or additional NER, only one of the seeds will take effect.
NB! For these changes to take effect, the pipe would need to be recreated."""
description: str = "No description"
"""Should provide a basic description of this MetaCAT model"""
category_name: Optional[str] = None
"""What category is this meta_cat model predicting/training"""
"""What category is this meta_cat model predicting/training.
NB! For these changes to take effect, the pipe would need to be recreated."""
category_value2id: Dict = {}
"""Map from category values to ID, if empty it will be autocalculated during training"""
vocab_size: Optional[int] = None
@@ -44,6 +50,8 @@ class General(MixingConfig, BaseModel):
Choose from:
- 'bbpe': Byte Pair Encoding Tokenizer
- 'bert-tokenizer': BERT Tokenizer
NB! For these changes to take effect, the pipe would need to be recreated.
"""
save_and_reuse_tokens: bool = False
"""This is a dangerous option, if not sure ALWAYS set to False. If set, it will try to share the pre-calculated
@@ -73,12 +81,15 @@ class Model(MixingConfig, BaseModel):
Note:
When changing the model, make sure to change the tokenizer accordingly.
NB! For these changes to take effect, the pipe would need to be recreated.
"""
model_variant: str = 'bert-base-uncased'
"""
Applicable only when using BERT:
Specifies the model variant to be used.
NB! For these changes to take effect, the pipe would need to be recreated.
"""
model_freeze_layers: bool = True
"""
@@ -88,18 +99,29 @@ class Model(MixingConfig, BaseModel):
- If True: BERT layers are frozen and only the fully connected (FC) layer(s) on top are trained.
- If False: Parameter-efficient fine-tuning will be applied using Low-Rank Adaptation (LoRA).
NB! For these changes to take effect, the pipe would need to be recreated.
"""
num_layers: int = 2
"""Number of layers in the model (both LSTM and BERT)"""
"""Number of layers in the model (both LSTM and BERT)
NB! For these changes to take effect, the pipe would need to be recreated."""
input_size: int = 300
"""
Specifies the size of the embedding layer.
Applicable only for LSTM model and ignored for BERT as BERT's embedding size is predefined.
NB! For these changes to take effect, the pipe would need to be recreated.
"""
hidden_size: int = 300
"""Number of neurons in the hidden layer"""
"""Number of neurons in the hidden layer.
NB! For these changes to take effect, the pipe would need to be recreated."""
dropout: float = 0.5
"""The dropout for the model.
NB! For these changes to take effect, the pipe would need to be recreated."""
phase_number: int = 0
"""Indicates whether two phase learning is to be used for training.
@@ -119,19 +141,30 @@ class Model(MixingConfig, BaseModel):
If fc2 is True and fc3 is set to True, then the 3rd fully connected layer is used
If lr_scheduler is set to True, then the learning rate scheduler is used with the optimizer
NB! For these changes to take effect, the pipe would need to be recreated.
"""
num_directions: int = 2
"""Applicable only for LSTM:
2 - bidirectional model, 1 - unidirectional"""
2 - bidirectional model, 1 - unidirectional
NB! For these changes to take effect, the pipe would need to be recreated."""
nclasses: int = 2
"""Number of classes that this model will output"""
"""Number of classes that this model will output.
NB! For these changes to take effect, the pipe would need to be recreated."""
padding_idx: int = -1
"""The padding index.
NB! For these changes to take effect, the pipe would need to be recreated."""
emb_grad: bool = True
"""Applicable only for LSTM:
If True, the embeddings will also be trained"""
If True, the embeddings will also be trained.
NB! For these changes to take effect, the pipe would need to be recreated."""
ignore_cpos: bool = False
"""If set to True center positions will be ignored when calculating representation"""

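The Model options above follow the same pattern. Below is a hedged sketch that switches a MetaCAT config from the LSTM to the BERT variant described in the docstrings; the values are illustrative only.

```python
from medcat.config_meta_cat import ConfigMetaCAT

config = ConfigMetaCAT()
config.model.model_name = 'bert'                  # BERT instead of the LSTM referenced above
config.model.model_variant = 'bert-base-uncased'
config.model.model_freeze_layers = True           # train only the FC layer(s) on top
config.model.nclasses = 3
config.model.dropout = 0.5

# As with the General settings, an already-built MetaCAT component keeps its
# old values; it has to be recreated for these changes to apply.
```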
35 changes: 34 additions & 1 deletion medcat/config_rel_cat.py
@@ -6,6 +6,9 @@
class General(MixingConfig, BaseModel):
"""The General part of the RelCAT config"""
device: str = "cpu"
"""The device to use (CPU or GPU).
NB! For these changes to take effect, the pipe would need to be recreated."""
relation_type_filter_pairs: List = []
"""Map from category values to ID, if empty it will be autocalculated during training"""
vocab_size: Optional[int] = None
@@ -24,27 +27,57 @@ class General(MixingConfig, BaseModel):
"""When processing relations from a MedCAT export, relations labeled as 'Other' are created from all the annotations pairs available"""

tokenizer_name: str = "bert"
"""The name of the tokenizer user.
NB! For these changes to take effect, the pipe would need to be recreated."""
model_name: str = "bert-base-uncased"
"""The name of the model used.
NB! For these changes to take effect, the pipe would need to be recreated."""
log_level: int = logging.INFO
"""The log level for RelCAT.
NB! For these changes to take effect, the pipe would need to be recreated."""
max_seq_length: int = 512
"""The maximum sequence length.
NB! For these changes to take effect, the pipe would need to be recreated."""
tokenizer_special_tokens: bool = False
"""Tokenizer.
NB! For these changes to take effect, the pipe would need to be recreated."""
annotation_schema_tag_ids: List = []
"""If a foreign non-MCAT trainer dataset is used, you can insert your own Rel entity token delimiters into the tokenizer, \
copy those token IDs here, and also resize your tokenizer embeddings and adjust the hidden_size of the model, this will depend on the number of tokens you introduce"""
labels2idx: Dict = {}
idx2labels: Dict = {}
pin_memory: bool = True
seed: int = 13
"""The seed for random number generation.
NOTE: If used alongside MetaCAT or additional NER, only one of the seeds will take effect.
NB! For these changes to take effect, the pipe would need to be recreated."""
task: str = "train"
"""The task for RelCAT.
NB! For these changes to take effect, the pipe would need to be recreated."""
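A corresponding hedged sketch for the RelCAT General options documented above; whether 'cuda' is available depends on the local install, and the label maps are hypothetical.

```python
from medcat.config_rel_cat import ConfigRelCAT

config = ConfigRelCAT()
config.general.device = "cuda"                   # needs a pipe rebuild, per the NB! notes
config.general.tokenizer_name = "bert"
config.general.model_name = "bert-base-uncased"
config.general.max_seq_length = 512
config.general.labels2idx = {"Other": 0, "has_dosage": 1}          # hypothetical labels
config.general.idx2labels = {v: k for k, v in config.general.labels2idx.items()}
```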


class Model(MixingConfig, BaseModel):
"""The model part of the RelCAT config"""
input_size: int = 300
hidden_size: int = 768
"""The hidden size.
NB! For these changes to take effect, the pipe would need to be recreated."""
hidden_layers: int = 3
""" hidden_size * 5, 5 being the number of tokens, default (s1,s2,e1,e2+CLS)"""
""" hidden_size * 5, 5 being the number of tokens, default (s1,s2,e1,e2+CLS).
NB! For these changes to take effect, the pipe would need to be recreated."""
model_size: int = 5120
"""The size of the model.
NB! For these changes to take effect, the pipe would need to be recreated."""
dropout: float = 0.2
num_directions: int = 2
"""2 - bidirectional model, 1 - unidirectional"""
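And a matching sketch for the RelCAT Model section; the values simply echo the defaults shown above and are not a recommendation.

```python
from medcat.config_rel_cat import ConfigRelCAT

config = ConfigRelCAT()
config.model.hidden_size = 768
config.model.hidden_layers = 3
config.model.model_size = 5120
config.model.dropout = 0.2
config.model.num_directions = 2                  # 2 = bidirectional, 1 = unidirectional

# As elsewhere, an existing RelCAT pipe has to be recreated for these to apply.
```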
