Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CU-86948uv4g docstring signature consistency #413

Merged
merged 21 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
575e206
CU-86948uv4g: Add pydoctest to dev requirements
mart-r Apr 9, 2024
fd1e2c8
CU-86948uv4g: Run pydoctest during main workflow
mart-r Apr 9, 2024
33cbdaf
CU-86948uv4g: Fix docstrings for CDB (make sure signatures match doc …
mart-r Apr 9, 2024
7662a96
CU-86948uv4g: Fix docstrings for Vocab (make sure signatures match do…
mart-r Apr 9, 2024
62eb79e
CU-86948uv4g: Fix docstrings for cdb_maker (make sure signatures matc…
mart-r Apr 9, 2024
d6543e1
CU-86948uv4g: Fix docstrings for config (make sure signatures match d…
mart-r Apr 9, 2024
850ac82
CU-86948uv4g: Move to darglint
mart-r Apr 10, 2024
b2fce68
CU-86948uv4g: Remove unnecessary pydoctest fixes
mart-r Apr 10, 2024
14f8ef7
CU-86948uv4g: Fix docstrings for CAT (make sure signatures match doc …
mart-r Apr 10, 2024
70ed455
CU-86948uv4g: Fix docstrings for stats (make sure signatures match do…
mart-r Apr 10, 2024
87f6855
CU-86948uv4g: Fix docstrings for CDB aker (make sure signatures match…
mart-r Apr 10, 2024
6da354d
CU-86948uv4g: Fix docstrings for a few more modules (make sure signat…
mart-r Apr 10, 2024
53262a8
CU-86948uv4g: Fix docstrings for a few more modules (make sure signat…
mart-r Apr 10, 2024
5a8ae1c
CU-86948uv4g: Update flake8 config to support later versions of flake8
mart-r Apr 10, 2024
7621dea
CU-86948uv4g: Fix docstrings for a the rest of the modules (make sure…
mart-r Apr 11, 2024
f7487e3
CU-86948uv4g: Move away from darglint
mart-r Apr 11, 2024
e275d0d
CU-86948uv4g: Bump flake8 to 7.0.0 for documentation checks
mart-r Apr 11, 2024
52612f6
CU-86948uv4g: Fix typing issues
mart-r Apr 11, 2024
12530e2
Merge branch 'master' into CU-86948uv4g-docstring-signature-consitency
mart-r Apr 11, 2024
37364c8
Merge branch 'master' into CU-86948uv4g-docstring-signature-consitency
mart-r Apr 18, 2024
f672dec
CU-86948uv4g: Fix additional doc string issues from new things in master
mart-r Apr 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 28 additions & 14 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,18 +1,32 @@
[flake8]
extend-ignore =
E124, ; closing bracket does not match visual indentation
E127, ; continuation line over-indented for visual indent
E128, ; continuation line under-indented for visual indent
E221, ; multiple spaces before operator
E225, ; missing whitespace around operator
E231, ; missing whitespace after ',' and ':'
E252, ; missing whitespace around parameter equal
E261, ; at least two spaces before inline comment
E265, ; block comment should start with '# '
E272, ; multiple spaces before keyword
E303, ; too many blank lines
E501, ; line too long
W291, ; trailing whitespace
W605, ; invalid escape sequence
E124,
; closing bracket does not match visual indentation
E127,
; continuation line over-indented for visual indent
E128,
; continuation line under-indented for visual indent
E221,
; multiple spaces before operator
E225,
; missing whitespace around operator
E231,
; missing whitespace after ',' and ':'
E252,
; missing whitespace around parameter equal
E261,
; at least two spaces before inline comment
E265,
; block comment should start with '# '
E272,
; multiple spaces before keyword
E303,
; too many blank lines
E501,
; line too long
W291,
; trailing whitespace
W605,
; invalid escape sequence

per-file-ignores = __init__.py:F401
187 changes: 123 additions & 64 deletions medcat/cat.py

Large diffs are not rendered by default.

75 changes: 45 additions & 30 deletions medcat/cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,11 @@ def get_name(self, cui: str) -> str:
the longest name assigned to the concept.

Args:
cui
cui (str):
Concept ID or unique identifier in this database.

Returns:
str: The name of the concept.
"""
name = cui # In case we do not find anything it will just return the CUI

Expand All @@ -128,7 +132,7 @@ def update_cui2average_confidence(self, cui: str, new_sim: float) -> None:
(self.cui2count_train.get(cui, 0) + 1)
self.is_dirty = True

def remove_names(self, cui: str, names: Dict) -> None:
def remove_names(self, cui: str, names: Dict[str, Dict]) -> None:
"""Remove names from an existing concept - effect is this name will never again be used to link to this concept.
This will only remove the name from the linker (namely name2cuis and name2cuis2status), the name will still be present everywhere else.
Why? Because it is bothersome to remove it from everywhere, but
Expand Down Expand Up @@ -166,8 +170,10 @@ def remove_names(self, cui: str, names: Dict) -> None:

def remove_cui(self, cui: str) -> None:
"""This function takes a `CUI` as an argument and removes it from all the internal objects that reference it.

Args:
cui
cui (str):
Concept ID or unique identifier in this database.
"""
if cui in self.cui2names:
del self.cui2names[cui]
Expand Down Expand Up @@ -201,7 +207,7 @@ def remove_cui(self, cui: str) -> None:
self.name2count_train = {name: len(cuis) for name, cuis in self.name2cuis.items()}
self.is_dirty = True

def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: bool = False) -> None:
def add_names(self, cui: str, names: Dict[str, Dict], name_status: str = 'A', full_build: bool = False) -> None:
"""Adds a name to an existing concept.

Args:
Expand All @@ -212,8 +218,8 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b
Names for this concept, or the value that if found in free text can be linked to this concept.
Names is an dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}`
name_status (str):
One of `P`, `N`, `A`
full_build (bool)):
One of `P`, `N`, `A`.
full_build (bool):
If True the dictionary self.addl_info will also be populated, contains a lot of extra information
about concepts, but can be very memory consuming. This is not necessary
for normal functioning of MedCAT (Default value `False`).
Expand All @@ -228,8 +234,8 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b
@deprecated("Use `cdb._add_concept` as this will be removed in a future release.")
def add_concept(self,
cui: str,
names: Dict,
ontologies: set,
names: Dict[str, Dict],
ontologies: Set[str],
name_status: str,
type_ids: Set[str],
description: str,
Expand Down Expand Up @@ -265,8 +271,8 @@ def add_concept(self,

def _add_concept(self,
cui: str,
names: Dict,
ontologies: set,
names: Dict[str, Dict],
ontologies: Set[str],
name_status: str,
type_ids: Set[str],
description: str,
Expand Down Expand Up @@ -294,6 +300,9 @@ def _add_concept(self,
If True the dictionary self.addl_info will also be populated, contains a lot of extra information
about concepts, but can be very memory consuming. This is not necessary
for normal functioning of MedCAT (Default Value `False`).

Raises:
ValueError: If there is no name info, yet the `names` dict is not empty.
"""
# Add CUI to the required dictionaries
if cui not in self.cui2names:
Expand Down Expand Up @@ -406,7 +415,7 @@ def add_addl_info(self, name: str, data: Dict, reset_existing: bool = False) ->
Args:
name (str):
What key should be used in the `addl_info` dictionary.
data (Dict[<whatever>]):
data (Dict):
What will be added as the value for the key `name`
reset_existing (bool):
Should old data be removed if it exists
Expand All @@ -425,18 +434,19 @@ def update_context_vector(self,
cui_count: int = 0) -> None:
"""Add the vector representation of a context for this CUI.

cui (str):
The concept in question.
vectors (Dict[str, numpy.ndarray]):
Vector represenation of the context, must have the format: {'context_type': np.array(<vector>), ...}
context_type - is usually one of: ['long', 'medium', 'short']
negative (bool):
Is this negative context of positive (Default Value `False`).
lr (int):
If set it will override the base value from the config file.
cui_count (int):
The learning rate will be calculated based on the count for the provided CUI + cui_count.
Defaults to 0.
Args:
cui (str):
The concept in question.
vectors (Dict[str, np.ndarray]):
Vector representation of the context, must have the format: {'context_type': np.array(<vector>), ...}
context_type - is usually one of: ['long', 'medium', 'short']
negative (bool):
Is this a negative or positive context (Default value `False`).
lr (Optional[float]):
If set it will override the base value from the config file.
cui_count (int):
The learning rate will be calculated based on the count for the provided CUI + cui_count.
Defaults to 0.
"""
if cui not in self.cui2context_vectors:
self.cui2context_vectors[cui] = {}
Expand Down Expand Up @@ -565,6 +575,9 @@ def load(cls, path: str, json_path: Optional[str] = None, config_dict: Optional[
Path to the JSON serialized folder
config_dict:
A dictionary that will be used to overwrite existing fields in the config of this CDB

Returns:
CDB: The resulting concept database.
"""
ser = CDBSerializer(path, json_path)
cdb = ser.deserialize(CDB)
Expand All @@ -582,7 +595,7 @@ def import_training(self, cdb: "CDB", overwrite: bool = True) -> None:
IMPORTANT it will not import name maps (cui2names, name2cuis or anything else) only vectors.

Args:
cdb (medcat.cdb.CDB):
cdb (CDB):
Concept database from which to import training vectors
overwrite (bool):
If True all training data in the existing CDB will be overwritten, else
Expand Down Expand Up @@ -641,7 +654,7 @@ def populate_cui2snames(self, force: bool = True) -> None:
cui2names into cui2snames.

Args:
force (bool, optional): Whether to force the (re-)population. Defaults to True.
force (bool): Whether to force the (re-)population. Defaults to True.
"""
if not force and self.cui2snames:
return
Expand All @@ -664,8 +677,11 @@ def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None:
However, the memory optimisation can be performed again afterwards.

Args:
cuis_to_keep (List[str]):
cuis_to_keep (Union[List[str], Set[str]]):
CUIs that will be kept, the rest will be removed (not completely, look above).

Raises:
Exception: If no snames and subsetting is not possible.
"""

if not self.cui2snames:
Expand Down Expand Up @@ -756,7 +772,7 @@ def most_similar(self,
min_cnt: int = 0,
topn: int = 50,
force_build: bool = False) -> Dict:
"""Given a concept it will calculate what other concepts in this CDB have the most similar
r"""Given a concept it will calculate what other concepts in this CDB have the most similar
embedding.

Args:
Expand All @@ -776,10 +792,9 @@ def most_similar(self,
Do not use cached sim matrix (Default value False)

Returns:
results (Dict):
A dictionary with topn results like: {<cui>: {'name': <name>, 'sim': <similarity>, 'type_name': <type_name>,
Dict:
A dictionary with top results like: {<cui>: {'name': <name>, 'sim': <similarity>, 'type_name': <type_name>,
'type_id': <type_id>, 'cnt': <number of training examples the concept has seen>}, ...}

"""

if 'similarity' in self.addl_info:
Expand Down
27 changes: 15 additions & 12 deletions medcat/cdb_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import datetime
import logging
import re
from typing import Optional, List, Dict, Union
from typing import Optional, List, Dict, Union, Any

from medcat.pipe import Pipe
from medcat.cdb import CDB
Expand Down Expand Up @@ -64,7 +64,7 @@ def prepare_csvs(self,
escapechar: Optional[str] = None,
index_col: bool = False,
full_build: bool = False,
only_existing_cuis: bool = False, **kwargs) -> CDB:
only_existing_cuis: bool = False, **kwargs: Any) -> CDB:
r"""Compile one or multiple CSVs into a CDB.

Note: This class/method generally uses the same instance of the CDB.
Expand All @@ -76,30 +76,33 @@ def prepare_csvs(self,
Args:
csv_paths (Union[pd.DataFrame, List[str]]):
An array of paths to the csv files that should be processed. Can also be an array of pd.DataFrames
full_build (bool):
If False only the core portions of the CDB will be built (the ones required for
the functioning of MedCAT). If True, everything will be added to the CDB - this
usually includes concept descriptions, various forms of names etc (take care that
this option produces a much larger CDB) (Default value False).
sep (str):
If necessary a custom separator for the csv files (Default value ',').
encoding (str):
encoding (Optional[str]):
Encoding to be used for reading the CSV file (Default value `None`).
escapechar (str):
escapechar (Optional[str]):
Escape char for the CSV (Default value None).
index_col (bool):
Index column for pandas read_csv (Default value False).
only_existing_cuis bool):
full_build (bool):
If False only the core portions of the CDB will be built (the ones required for
the functioning of MedCAT). If True, everything will be added to the CDB - this
usually includes concept descriptions, various forms of names etc (take care that
this option produces a much larger CDB) (Default value False).
only_existing_cuis (bool):
If True no new CUIs will be added, but only linked names will be extended. Mainly used when
enriching names of a CDB (e.g. SNOMED with UMLS terms) (Default value `False`).
Returns:
medcat.cdb.CDB: CDB with the new concepts added.
kwargs (Any):
Will be passed to pandas for CSV reading

Note:
\*\*kwargs:
Will be passed to pandas for CSV reading
csv:
Examples of the CSV used to make the CDB can be found on [GitHub](link)

Returns:
CDB: CDB with the new concepts added.
"""

useful_columns = ['cui', 'name', 'ontologies', 'name_status', 'type_ids', 'description']
Expand Down
7 changes: 5 additions & 2 deletions medcat/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,10 @@ def parse_config_file(self, path: str, extractor: ValueExtractor = _DEFAULT_EXTR

Args:
path(str): the path to the config file
extractor(ValueExtractor, optional): (Default value = _DEFAULT_EXTRACTOR)
extractor(ValueExtractor): (Default value = _DEFAULT_EXTRACTOR)

Raises:
ValueError: In case of unknown attribute.
"""
with open(path, 'r') as f:
for line in f:
Expand Down Expand Up @@ -233,7 +236,7 @@ def fields(self) -> Dict[str, ModelField]:
"""Get the fields associated with this config.

Returns:
Dict[str, Field]: The dictionary of the field names and fields
Dict[str, ModelField]: The dictionary of the field names and fields
"""
return cast(BaseModel, self).__fields__

Expand Down
4 changes: 2 additions & 2 deletions medcat/datasets/medcat_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _info(self):
)

def _split_generators(self, dl_manager): # noqa
"""Returns SplitGenerators."""
"""Returns SplitGenerators.""" # noqa
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
Expand All @@ -77,7 +77,7 @@ def _split_generators(self, dl_manager): # noqa
]

def _generate_examples(self, filepath):
"""This function returns the examples in the raw (text) form."""
"""This function returns the examples in the raw (text) form.""" # noqa
logging.info("generating examples from = %s", filepath)
with open(filepath, 'rb') as f:
docs = pickle.load(f)
Expand Down
4 changes: 2 additions & 2 deletions medcat/datasets/patient_concept_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _info(self):
)

def _split_generators(self, dl_manager): # noqa
"""Returns SplitGenerators."""
"""Returns SplitGenerators.""" # noqa
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
Expand All @@ -74,7 +74,7 @@ def _split_generators(self, dl_manager): # noqa
]

def _generate_examples(self, filepath):
"""This function returns the examples in the raw (text) form."""
"""This function returns the examples in the raw (text) form.""" # noqa
logging.info("generating examples from = %s", filepath)
with open(filepath, 'rb') as f:
pt2stream = pickle.load(f)
Expand Down
4 changes: 2 additions & 2 deletions medcat/datasets/transformers_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _info(self):
)

def _split_generators(self, dl_manager): # noqa
"""Returns SplitGenerators."""
"""Returns SplitGenerators.""" # noqa
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
Expand All @@ -73,7 +73,7 @@ def _split_generators(self, dl_manager): # noqa
),
]

def _generate_examples(self, filepaths):
def _generate_examples(self, filepaths): # noqa
cnt = 0
for filepath in filepaths:
logging.info("generating examples from = %s", filepath)
Expand Down
Loading
Loading