From 81ba0bfcd5f99376e7d3ecda53b2c25cbc078e50 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 22 Nov 2023 17:44:07 +0000 Subject: [PATCH 01/17] Initial commit for merge_cdb method --- medcat/cdb.py | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 44d4fd9dd..86d0ff810 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -802,3 +802,97 @@ def calculate_hash(self): self._hash = hasher.hexdigest() logger.info("Found new CDB hash: %s", self._hash) return self._hash + +@staticmethod +def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict = {}): + """Merge two CDB's together to produce a single CDB. + + Args: + cdb1 (medcat.cdb.CDB): + The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as + cui2preferred_name), this cdb values will be prioritised over cdb2. + cdb2 (medcat.cdb.CDB): + The second medcat cdb to merge. + overwrite (bool): + NYI: Do not merge certain dictionaries, and prioritise a cdb. + vector_import (Dict[str, Dict[str, np.array]]): + NYI: Vectors to import, using the same format as cui2context_vectors. + """ + # TODO: overwriting, vector import + config = cdb1.config.copy() + cdb = CDB(config) + + # names - copy cdb 1 as that is priority, and save computation time + cdb.name2cuis = cdb1.name2cuis.copy() + cdb.name2cuis2status = cdb1.name2cuis2status.copy() + cdb.name2count_train = cdb1.name2count_train.copy() + cdb.name_isupper = cdb1.name_isupper.copy() + for name in cdb2.name2cuis: + if name in cdb1.name2cuis: #if they exist in both cdbs + cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name + if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} + if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: # if name only exists in cdb 2 + cdb.name2cuis[name] = cdb2.name2cuis[name] + if name in cdb2.name2cuis2status: cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] + if name in cdb2.name2count_train: cdb.name2count_train[name] = cdb2.name2count_train[name] + if name in cdb2.name_isupper: cdb.name_isupper[name] = cdb2.name_isupper[name] + + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) + + # cui merging + cdb.cui2names = cdb1.cui2names.copy() + cdb.cui2snames = cdb1.cui2snames.copy() + cdb.cui2count_train = cdb1.cui2count_train.copy() + cdb.cui2info = cdb1.cui2info.copy() + cdb.cui2context_vectors = cdb1.cui2context_vectors.copy() + cdb.cui2tags = cdb1.cui2tags.copy() + cdb.cui2type_ids = cdb1.cui2type_ids.copy() + cdb.cui2preferred_name = cdb1.cui2preferred_name.copy() + + cdb.cui2average_confidence = cdb1.cui2average_confidence.copy() + for cui in cdb2.cui2names: + if cui in cdb1.cui2names: + cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) + if cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) + if cui in cdb1.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] + # this is where cui2info would be + if cui in cdb1.cui2context_vectors: + contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short + norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) + weights = 
[cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] + for s in contexts: + if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: + cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] + elif s in cdb1.cui2context_vectors[cui]: + cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] + else: + cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] + if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + # Nothing to do with prefered name, unless overwrite + else: + cdb.cui2names[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2snames: cdb.cui2snames[cui] = cdb2.cui2snames[cui] + if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done + if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if cui in cdb2.cui2tags: cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] + if cui in cdb2.cui2preferred_name: cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] + + cdb.addl_info = cdb1.addl_info.copy() + for key in cdb2.addl_info: + if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param + cdb.addl_info[key] = cdb2.addl_info[key] + + # vocab, adding counts if they occur in both + cdb.vocab = cdb1.vocab.copy() + for word in cdb2.vocab: + if word in cdb.vocab: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] + + return cdb \ No newline at end of file From 379a0dbbebc8212cf5c6f5ed46076bd79c15cc5c Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 22 Nov 2023 23:47:28 +0000 Subject: [PATCH 02/17] Added indentation to make merge_cdb a class method --- medcat/cdb.py | 184 +++++++++++++++++++++++++------------------------- 1 file changed, 92 insertions(+), 92 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 86d0ff810..ec6f80319 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -803,96 +803,96 @@ def calculate_hash(self): logger.info("Found new CDB hash: %s", self._hash) return self._hash -@staticmethod -def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict = {}): - """Merge two CDB's together to produce a single CDB. - - Args: - cdb1 (medcat.cdb.CDB): - The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as - cui2preferred_name), this cdb values will be prioritised over cdb2. - cdb2 (medcat.cdb.CDB): - The second medcat cdb to merge. - overwrite (bool): - NYI: Do not merge certain dictionaries, and prioritise a cdb. - vector_import (Dict[str, Dict[str, np.array]]): - NYI: Vectors to import, using the same format as cui2context_vectors. 
- """ - # TODO: overwriting, vector import - config = cdb1.config.copy() - cdb = CDB(config) - - # names - copy cdb 1 as that is priority, and save computation time - cdb.name2cuis = cdb1.name2cuis.copy() - cdb.name2cuis2status = cdb1.name2cuis2status.copy() - cdb.name2count_train = cdb1.name2count_train.copy() - cdb.name_isupper = cdb1.name_isupper.copy() - for name in cdb2.name2cuis: - if name in cdb1.name2cuis: #if they exist in both cdbs - cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name - if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} - if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: # if name only exists in cdb 2 - cdb.name2cuis[name] = cdb2.name2cuis[name] - if name in cdb2.name2cuis2status: cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] - if name in cdb2.name2count_train: cdb.name2count_train[name] = cdb2.name2count_train[name] - if name in cdb2.name_isupper: cdb.name_isupper[name] = cdb2.name_isupper[name] - - # snames - cdb.snames = cdb1.snames.union(cdb2.snames) - - # cui merging - cdb.cui2names = cdb1.cui2names.copy() - cdb.cui2snames = cdb1.cui2snames.copy() - cdb.cui2count_train = cdb1.cui2count_train.copy() - cdb.cui2info = cdb1.cui2info.copy() - cdb.cui2context_vectors = cdb1.cui2context_vectors.copy() - cdb.cui2tags = cdb1.cui2tags.copy() - cdb.cui2type_ids = cdb1.cui2type_ids.copy() - cdb.cui2preferred_name = cdb1.cui2preferred_name.copy() - - cdb.cui2average_confidence = cdb1.cui2average_confidence.copy() - for cui in cdb2.cui2names: - if cui in cdb1.cui2names: - cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) - if cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) - if cui in cdb1.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] - # this is where cui2info would be - if cui in cdb1.cui2context_vectors: - contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short - norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) - weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] - for s in contexts: - if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] - elif s in cdb1.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] - else: - cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] - if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) - if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) - # Nothing to do with prefered name, unless overwrite - else: - cdb.cui2names[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2snames: cdb.cui2snames[cui] = cdb2.cui2snames[cui] - if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2info: cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done - if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] - if cui in cdb2.cui2tags: cdb.cui2tags[cui] = 
cdb2.cui2tags[cui] - if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - if cui in cdb2.cui2preferred_name: cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] - - cdb.addl_info = cdb1.addl_info.copy() - for key in cdb2.addl_info: - if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param - cdb.addl_info[key] = cdb2.addl_info[key] - - # vocab, adding counts if they occur in both - cdb.vocab = cdb1.vocab.copy() - for word in cdb2.vocab: - if word in cdb.vocab: - cdb.vocab[word] += cdb2.vocab[word] - else: - cdb.vocab[word] = cdb2.vocab[word] + @staticmethod + def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict = {}): + """Merge two CDB's together to produce a single CDB. + + Args: + cdb1 (medcat.cdb.CDB): + The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as + cui2preferred_name), this cdb values will be prioritised over cdb2. + cdb2 (medcat.cdb.CDB): + The second medcat cdb to merge. + overwrite (bool): + NYI: Do not merge certain dictionaries, and prioritise a cdb. + vector_import (Dict[str, Dict[str, np.array]]): + NYI: Vectors to import, using the same format as cui2context_vectors. + """ + # TODO: overwriting, vector import + config = cdb1.config.copy() + cdb = CDB(config) + + # names - copy cdb 1 as that is priority, and save computation time + cdb.name2cuis = cdb1.name2cuis.copy() + cdb.name2cuis2status = cdb1.name2cuis2status.copy() + cdb.name2count_train = cdb1.name2count_train.copy() + cdb.name_isupper = cdb1.name_isupper.copy() + for name in cdb2.name2cuis: + if name in cdb1.name2cuis: #if they exist in both cdbs + cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name + if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} + if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: # if name only exists in cdb 2 + cdb.name2cuis[name] = cdb2.name2cuis[name] + if name in cdb2.name2cuis2status: cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] + if name in cdb2.name2count_train: cdb.name2count_train[name] = cdb2.name2count_train[name] + if name in cdb2.name_isupper: cdb.name_isupper[name] = cdb2.name_isupper[name] + + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) + + # cui merging + cdb.cui2names = cdb1.cui2names.copy() + cdb.cui2snames = cdb1.cui2snames.copy() + cdb.cui2count_train = cdb1.cui2count_train.copy() + cdb.cui2info = cdb1.cui2info.copy() + cdb.cui2context_vectors = cdb1.cui2context_vectors.copy() + cdb.cui2tags = cdb1.cui2tags.copy() + cdb.cui2type_ids = cdb1.cui2type_ids.copy() + cdb.cui2preferred_name = cdb1.cui2preferred_name.copy() + + cdb.cui2average_confidence = cdb1.cui2average_confidence.copy() + for cui in cdb2.cui2names: + if cui in cdb1.cui2names: + cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) + if cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) + if cui in cdb1.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] + # this is where cui2info would be + if cui in cdb1.cui2context_vectors: + contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short + norm = np.sum([cdb1.cui2count_train[cui], 
cdb2.cui2count_train[cui]]) + weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] + for s in contexts: + if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: + cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] + elif s in cdb1.cui2context_vectors[cui]: + cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] + else: + cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] + if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + # Nothing to do with prefered name, unless overwrite + else: + cdb.cui2names[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2snames: cdb.cui2snames[cui] = cdb2.cui2snames[cui] + if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done + if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if cui in cdb2.cui2tags: cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] + if cui in cdb2.cui2preferred_name: cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] + + cdb.addl_info = cdb1.addl_info.copy() + for key in cdb2.addl_info: + if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param + cdb.addl_info[key] = cdb2.addl_info[key] + + # vocab, adding counts if they occur in both + cdb.vocab = cdb1.vocab.copy() + for word in cdb2.vocab: + if word in cdb.vocab: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] - return cdb \ No newline at end of file + return cdb \ No newline at end of file From e64b2e0714ec558d2199f5cd9fe177c22561788b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 00:18:30 +0000 Subject: [PATCH 03/17] fixed syntax issues --- medcat/cdb.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index ec6f80319..6ddcfa5ab 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -804,7 +804,7 @@ def calculate_hash(self): return self._hash @staticmethod - def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict = {}): + def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict = {}): """Merge two CDB's together to produce a single CDB. 
Args: @@ -828,7 +828,7 @@ def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict cdb.name2count_train = cdb1.name2count_train.copy() cdb.name_isupper = cdb1.name_isupper.copy() for name in cdb2.name2cuis: - if name in cdb1.name2cuis: #if they exist in both cdbs + if name in cdb1.name2cuis: # if they exist in both cdbs cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason @@ -886,7 +886,7 @@ def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict for key in cdb2.addl_info: if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param cdb.addl_info[key] = cdb2.addl_info[key] - + # vocab, adding counts if they occur in both cdb.vocab = cdb1.vocab.copy() for word in cdb2.vocab: @@ -895,4 +895,5 @@ def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict else: cdb.vocab[word] = cdb2.vocab[word] - return cdb \ No newline at end of file + return cdb + \ No newline at end of file From eefb010f115f25da2d3b4259ba1383ce34906f06 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 00:25:18 +0000 Subject: [PATCH 04/17] more lint fixes --- medcat/cdb.py | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 6ddcfa5ab..19fe04d55 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -830,13 +830,18 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict for name in cdb2.name2cuis: if name in cdb1.name2cuis: # if they exist in both cdbs cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name - if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} - if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + if name in cdb1.name2cuis2status: + cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} + if name in cdb1.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason else: # if name only exists in cdb 2 cdb.name2cuis[name] = cdb2.name2cuis[name] - if name in cdb2.name2cuis2status: cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] - if name in cdb2.name2count_train: cdb.name2count_train[name] = cdb2.name2count_train[name] - if name in cdb2.name_isupper: cdb.name_isupper[name] = cdb2.name_isupper[name] + if name in cdb2.name2cuis2status: + cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] + if name in cdb2.name_isupper: + cdb.name_isupper[name] = cdb2.name_isupper[name] # snames cdb.snames = cdb1.snames.union(cdb2.snames) @@ -855,8 +860,10 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict for cui in cdb2.cui2names: if cui in cdb1.cui2names: cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) - if 
cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) - if cui in cdb1.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] + if cui in cdb1.cui2snames: + cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) + if cui in cdb1.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] # this is where cui2info would be if cui in cdb1.cui2context_vectors: contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short @@ -864,7 +871,7 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] for s in contexts: if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] + cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] elif s in cdb1.cui2context_vectors[cui]: cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] else: @@ -874,13 +881,20 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict # Nothing to do with prefered name, unless overwrite else: cdb.cui2names[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2snames: cdb.cui2snames[cui] = cdb2.cui2snames[cui] - if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2info: cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done - if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] - if cui in cdb2.cui2tags: cdb.cui2tags[cui] = cdb2.cui2tags[cui] - if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - if cui in cdb2.cui2preferred_name: cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] + if cui in cdb2.cui2snames: + cdb.cui2snames[cui] = cdb2.cui2snames[cui] + if cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: + cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done + if cui in cdb2.cui2context_vectors: + cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if cui in cdb2.cui2tags: + cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: + cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] + if cui in cdb2.cui2preferred_name: + cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] cdb.addl_info = cdb1.addl_info.copy() for key in cdb2.addl_info: @@ -896,4 +910,3 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict cdb.vocab[word] = cdb2.vocab[word] return cdb - \ No newline at end of file From ff48a2a8168c5216afe0ddb14ec6eae13ae6df78 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 00:29:31 +0000 Subject: [PATCH 05/17] more lint fixes --- medcat/cdb.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 19fe04d55..6580569f1 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -876,8 +876,10 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] else: cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] - 
if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) - if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + if cui in cdb1.cui2tags: + cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: + cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) # Nothing to do with prefered name, unless overwrite else: cdb.cui2names[cui] = cdb2.cui2names[cui] From f299677c12afed91a4f50ecb8848a708491c6429 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 19:15:33 +0000 Subject: [PATCH 06/17] bug fixes of merge_cdb --- medcat/cdb.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 6580569f1..368d7bc0e 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -804,7 +804,7 @@ def calculate_hash(self): return self._hash @staticmethod - def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict = {}): + def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[str, dict[str, np.array]] = {}): """Merge two CDB's together to produce a single CDB. Args: @@ -863,12 +863,14 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict if cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) if cui in cdb1.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] + cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + cdb1.cui2count_train[cui] # this is where cui2info would be if cui in cdb1.cui2context_vectors: - contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short + contexts = set(list(cdb1.cui2context_vectors[cui]) + list(cdb2.cui2context_vectors[cui].keys())) # xlong, long, medium, short norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) - weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] + print(cdb1.cui2count_train[cui]) + print(norm) + weights = [np.divide(cdb1.cui2count_train[cui], norm), np.divide(cdb2.cui2count_train[cui], norm)] for s in contexts: if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] From abb68b5994e73a219e999cb4544003ff5cd358d9 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 22:35:09 +0000 Subject: [PATCH 07/17] removed print statements --- medcat/cdb.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 368d7bc0e..47a9d8e02 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -867,9 +867,7 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[ # this is where cui2info would be if cui in cdb1.cui2context_vectors: contexts = set(list(cdb1.cui2context_vectors[cui]) + list(cdb2.cui2context_vectors[cui].keys())) # xlong, long, medium, short - norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) - print(cdb1.cui2count_train[cui]) - print(norm) + norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) weights = [np.divide(cdb1.cui2count_train[cui], norm), np.divide(cdb2.cui2count_train[cui], norm)] for s in contexts: if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: From 900439aca48bdfb018fa082b6508f418895092ab Mon Sep 17 00:00:00 2001 From: 
adam-sutton-1992 Date: Wed, 29 Nov 2023 12:59:03 +0000 Subject: [PATCH 08/17] intermediate changes of merge_cdb and testing function --- medcat/cdb.py | 1 + tests/test_cdb.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 47a9d8e02..886809604 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -823,6 +823,7 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[ cdb = CDB(config) # names - copy cdb 1 as that is priority, and save computation time + # TODO: CHECK BENEFITS OF USING ADD_NAMES HERE cdb.name2cuis = cdb1.name2cuis.copy() cdb.name2cuis2status = cdb1.name2cuis2status.copy() cdb.name2count_train = cdb1.name2count_train.copy() diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 96425bc8c..8ec055a37 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -6,6 +6,7 @@ import numpy as np from medcat.config import Config from medcat.cdb_maker import CDBMaker +from medcat.cdb import CDB class CDBTests(unittest.TestCase): @@ -82,5 +83,31 @@ def test_cui2snames_population(self): with self.subTest(cui): self.assertIn(cui, self.undertest.cui2snames) + + def test_merge_cdb(self): + # generating CDBs + config = Config() + maker = CDBMaker(config) + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_creator", "umls_sample.csv") + cdb1 = maker.prepare_csvs(csv_paths=[path]) + cdb2 = cdb1.copy() + + # generating vectors and setting up + zeroes = np.zeros(shape=(1,300)) + ones = np.ones(shape=(1,300)) + for i, cui in enumerate(cdb1.cui2names): + cdb1.cui2context_vectors[cui] = {"short" : zeroes} + cdb2.cui2context_vectors[cui] = {"short" : ones} + cdb1.cui2count_train[cui] = 1 + cdb2.cui2count_train[cui] = i + test_add = {"test": {'tokens': "test_token", 'snames': "test_sname", 'raw_name': "test_raw_name", "is_upper" : "P"}} + cdb1.add_names("C0006826", test_add) + + # merging + cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) + # tests + + + if __name__ == '__main__': unittest.main() From 6a820f03e627fee1f02a59a858a340de35bf41f3 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Mon, 11 Dec 2023 20:13:30 +0000 Subject: [PATCH 09/17] changes to merge_cdb and adding unit tests for method --- medcat/cdb.py | 143 +++++++++++++++++++++++----------------------- tests/test_cdb.py | 29 +++++++--- 2 files changed, 92 insertions(+), 80 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 886809604..075a8d611 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -7,6 +7,7 @@ import numpy as np from typing import Dict, Set, Optional, List, Union from functools import partial +from copy import deepcopy from medcat import __version__ from medcat.utils.hasher import Hasher @@ -804,8 +805,11 @@ def calculate_hash(self): return self._hash @staticmethod - def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[str, dict[str, np.array]] = {}): - """Merge two CDB's together to produce a single CDB. + def merge_cdb(cdb1: "CDB", + cdb2: "CDB", + overwrite_training: int = 0, + full_build: bool = False): + """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. Args: cdb1 (medcat.cdb.CDB): @@ -813,103 +817,98 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[ cui2preferred_name), this cdb values will be prioritised over cdb2. cdb2 (medcat.cdb.CDB): The second medcat cdb to merge. - overwrite (bool): - NYI: Do not merge certain dictionaries, and prioritise a cdb. 
- vector_import (Dict[str, Dict[str, np.array]]): - NYI: Vectors to import, using the same format as cui2context_vectors. + overwrite_training (int): + Choose to prioritise a CDB's context vectors values over merging gracefully. 0 - no prio, 1 - CDB1, 2 - CDB2 + full_build (bool): + Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" """ - # TODO: overwriting, vector import - config = cdb1.config.copy() + config = deepcopy(cdb1.config) cdb = CDB(config) - # names - copy cdb 1 as that is priority, and save computation time - # TODO: CHECK BENEFITS OF USING ADD_NAMES HERE - cdb.name2cuis = cdb1.name2cuis.copy() - cdb.name2cuis2status = cdb1.name2cuis2status.copy() - cdb.name2count_train = cdb1.name2count_train.copy() - cdb.name_isupper = cdb1.name_isupper.copy() - for name in cdb2.name2cuis: - if name in cdb1.name2cuis: # if they exist in both cdbs - cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name - if name in cdb1.name2cuis2status: - cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} - if name in cdb1.name2count_train: - cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: # if name only exists in cdb 2 - cdb.name2cuis[name] = cdb2.name2cuis[name] - if name in cdb2.name2cuis2status: - cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] - if name in cdb2.name2count_train: - cdb.name2count_train[name] = cdb2.name2count_train[name] - if name in cdb2.name_isupper: - cdb.name_isupper[name] = cdb2.name_isupper[name] - - # snames - cdb.snames = cdb1.snames.union(cdb2.snames) + # Copy CDB 1 - as all settings from CDB 1 will be carried over + cdb.cui2names = deepcopy(cdb1.cui2names) + cdb.cui2snames = deepcopy(cdb1.cui2snames) + cdb.cui2count_train = deepcopy(cdb1.cui2count_train) + cdb.cui2info = deepcopy(cdb1.cui2info) + cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) + cdb.cui2tags = deepcopy(cdb1.cui2tags) + cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) + cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) + cdb.name2cuis = deepcopy(cdb1.name2cuis) + cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) + cdb.name2count_train = deepcopy(cdb1.name2count_train) + cdb.name_isupper = deepcopy(cdb1.name_isupper) + if full_build: + cdb.addl_info = deepcopy(cdb1.addl_info) - # cui merging - cdb.cui2names = cdb1.cui2names.copy() - cdb.cui2snames = cdb1.cui2snames.copy() - cdb.cui2count_train = cdb1.cui2count_train.copy() - cdb.cui2info = cdb1.cui2info.copy() - cdb.cui2context_vectors = cdb1.cui2context_vectors.copy() - cdb.cui2tags = cdb1.cui2tags.copy() - cdb.cui2type_ids = cdb1.cui2type_ids.copy() - cdb.cui2preferred_name = cdb1.cui2preferred_name.copy() - - cdb.cui2average_confidence = cdb1.cui2average_confidence.copy() + # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name for cui in cdb2.cui2names: + names = dict() + for name in cdb2.cui2names[cui]: + names[name] = {'snames' : cdb2.cui2snames.get(cui, set()), 'is_upper' : cdb2.name_isupper.get(name, False), 'tokens' : {}} + name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' + ontologies = set() + description = '' + # For addl_info check cui2original_names as they MUST be added + if full_build and cui in cdb2.addl_info['cui2original_names']: + if 'cui2ontologies' in cdb2.addl_info: + 
ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) + if 'cui2description' in cdb2.addl_info: + description = cdb2.addl_info['cui2description'][cui] + cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, + type_ids=cdb2.cui2type_ids[cui], description=description, full_build=full_build) if cui in cdb1.cui2names: - cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) - if cui in cdb1.cui2snames: - cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) - if cui in cdb1.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + cdb1.cui2count_train[cui] - # this is where cui2info would be + if cui in cdb1.cui2count_train or cui in cdb2.cui2count_train: + if overwrite_training == 1 and cui in cdb1.cui2count_train[cui]: + cdb.cui2count_train[cui] = cdb1.cui2count_train[cui] + elif overwrite_training == 2 and cui in cdb2.cui2count_train[cui]: + cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + else: + cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) if cui in cdb1.cui2context_vectors: - contexts = set(list(cdb1.cui2context_vectors[cui]) + list(cdb2.cui2context_vectors[cui].keys())) # xlong, long, medium, short - norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) - weights = [np.divide(cdb1.cui2count_train[cui], norm), np.divide(cdb2.cui2count_train[cui], norm)] - for s in contexts: - if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] - elif s in cdb1.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] - else: - cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] + contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short + if overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]: + weights = [1, 0] + elif overwrite_training == 2 and cui in cdb2.cui2context_vectors[cui]: + weights = [0, 1] + else: + norm = cdb.cui2count_train[cui] + weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] + for s in contexts: + cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) - # Nothing to do with prefered name, unless overwrite else: - cdb.cui2names[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2snames: - cdb.cui2snames[cui] = cdb2.cui2snames[cui] if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] if cui in cdb2.cui2info: - cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done + cdb.cui2info[cui] = cdb2.cui2info[cui] if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] if cui in cdb2.cui2tags: cdb.cui2tags[cui] = cdb2.cui2tags[cui] if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - if cui in cdb2.cui2preferred_name: - cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] + + for name in cdb2.name2cuis: + if name in cdb1.name2cuis: # if they 
exist in both cdbs + if name in cdb1.name2count_train and name in cdb2.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] - cdb.addl_info = cdb1.addl_info.copy() - for key in cdb2.addl_info: - if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param - cdb.addl_info[key] = cdb2.addl_info[key] + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) # vocab, adding counts if they occur in both - cdb.vocab = cdb1.vocab.copy() + cdb.vocab = deepcopy(cdb1.vocab) for word in cdb2.vocab: if word in cdb.vocab: cdb.vocab[word] += cdb2.vocab[word] else: cdb.vocab[word] = cdb2.vocab[word] - + return cdb diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 8ec055a37..7177ed903 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -85,27 +85,40 @@ def test_cui2snames_population(self): def test_merge_cdb(self): - # generating CDBs + # generating cdbs - two maker are requested as they point to the same created CDB. config = Config() - maker = CDBMaker(config) + config.general["spacy_model"] = "en_core_web_md" + maker1 = CDBMaker(config) + maker2 = CDBMaker(config) # second maker is required as it will otherwise point to same object path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_creator", "umls_sample.csv") - cdb1 = maker.prepare_csvs(csv_paths=[path]) - cdb2 = cdb1.copy() + cdb1 = maker1.prepare_csvs(csv_paths=[path]) + cdb2 = maker2.prepare_csvs(csv_paths=[path]) # generating vectors and setting up zeroes = np.zeros(shape=(1,300)) ones = np.ones(shape=(1,300)) for i, cui in enumerate(cdb1.cui2names): - cdb1.cui2context_vectors[cui] = {"short" : zeroes} - cdb2.cui2context_vectors[cui] = {"short" : ones} + cdb1.cui2context_vectors[cui] = {"short" : ones} + cdb2.cui2context_vectors[cui] = {"short" : zeroes} cdb1.cui2count_train[cui] = 1 - cdb2.cui2count_train[cui] = i - test_add = {"test": {'tokens': "test_token", 'snames': "test_sname", 'raw_name': "test_raw_name", "is_upper" : "P"}} + cdb2.cui2count_train[cui] = i + 1 + test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper" : "P"}} cdb1.add_names("C0006826", test_add) + unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper" : "P"}} + cdb2.add_names("UniqueTest", unique_test) + cdb2.cui2context_vectors["UniqueTest"] = {"short" : ones} # merging cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) + # tests + self.assertIn("test", cdb.cui2names["C0006826"]) + self.assertIn("test_name", cdb.cui2snames["C0006826"]) + self.assertEqual("Cancer", cdb.cui2preferred_name["C0006826"]) + self.assertTrue(np.array_equal(np.ones(shape=(1,300)), cdb.cui2context_vectors["UniqueTest"]["short"])) + base = np.ones(shape=(1,300)) + for i, cui in enumerate(cdb1.cui2names): + self.assertTrue(np.array_equal(cdb.cui2context_vectors[cui]["short"], np.divide(base, i+2))) From f96758aaa857ffd034726fbc84b830793435c0aa Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Tue, 12 Dec 2023 14:06:31 +0000 Subject: [PATCH 10/17] fixing lint issues --- medcat/cdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index f7800aeb6..1870b8f7a 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -967,7 +967,7 @@ def merge_cdb(cdb1: "CDB", for cui in cdb2.cui2names: 
names = dict() for name in cdb2.cui2names[cui]: - names[name] = {'snames' : cdb2.cui2snames.get(cui, set()), 'is_upper' : cdb2.name_isupper.get(name, False), 'tokens' : {}} + names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}} name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' ontologies = set() description = '' @@ -1013,7 +1013,7 @@ def merge_cdb(cdb1: "CDB", cdb.cui2tags[cui] = cdb2.cui2tags[cui] if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - + for name in cdb2.name2cuis: if name in cdb1.name2cuis: # if they exist in both cdbs if name in cdb1.name2count_train and name in cdb2.name2count_train: From 1975b1c13b657966ff76c1c075e164e79cec5452 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Tue, 12 Dec 2023 14:24:16 +0000 Subject: [PATCH 11/17] fixing flake8 linting --- medcat/cdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 1870b8f7a..d773d1f4f 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -1032,5 +1032,5 @@ def merge_cdb(cdb1: "CDB", cdb.vocab[word] += cdb2.vocab[word] else: cdb.vocab[word] = cdb2.vocab[word] - + return cdb From 6f752c8d2d10768fe9e3a822eb26a1b5aa973aa7 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 13 Dec 2023 20:09:14 +0000 Subject: [PATCH 12/17] bug fixes, additional tests, and more documentation --- medcat/cdb.py | 51 ++++++++++++++++++++++++----------------------- tests/test_cdb.py | 31 ++++++++++++++++++---------- 2 files changed, 47 insertions(+), 35 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index d773d1f4f..1737b4bad 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -932,6 +932,7 @@ def merge_cdb(cdb1: "CDB", overwrite_training: int = 0, full_build: bool = False): """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. + `addl_info` can not be perfectly merged, and will prioritise cdb1. 
see `full_build` Args: cdb1 (medcat.cdb.CDB): @@ -967,35 +968,33 @@ def merge_cdb(cdb1: "CDB", for cui in cdb2.cui2names: names = dict() for name in cdb2.cui2names[cui]: - names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}} + names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' + # For addl_info check cui2original_names as they MUST be added ontologies = set() description = '' - # For addl_info check cui2original_names as they MUST be added - if full_build and cui in cdb2.addl_info['cui2original_names']: + to_build = False + if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): + to_build = True if 'cui2ontologies' in cdb2.addl_info: ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) if 'cui2description' in cdb2.addl_info: description = cdb2.addl_info['cui2description'][cui] cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, - type_ids=cdb2.cui2type_ids[cui], description=description, full_build=full_build) + type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) if cui in cdb1.cui2names: - if cui in cdb1.cui2count_train or cui in cdb2.cui2count_train: - if overwrite_training == 1 and cui in cdb1.cui2count_train[cui]: - cdb.cui2count_train[cui] = cdb1.cui2count_train[cui] - elif overwrite_training == 2 and cui in cdb2.cui2count_train[cui]: + if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): + if overwrite_training == 2 and cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] else: cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) - if cui in cdb1.cui2context_vectors: - contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short - if overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]: - weights = [1, 0] - elif overwrite_training == 2 and cui in cdb2.cui2context_vectors[cui]: + if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): + if overwrite_training == 2 and cui in cdb2.cui2context_vectors: weights = [0, 1] else: norm = cdb.cui2count_train[cui] weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] + contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short for s in contexts: cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) if cui in cdb1.cui2tags: @@ -1014,23 +1013,25 @@ def merge_cdb(cdb1: "CDB", if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - for name in cdb2.name2cuis: - if name in cdb1.name2cuis: # if they exist in both cdbs - if name in cdb1.name2count_train and name in cdb2.name2count_train: - cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: - if name in 
cdb2.name2count_train: - cdb.name2count_train[name] = cdb2.name2count_train[name] + if overwrite_training != 1: + for name in cdb2.name2cuis: + if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs + if name in cdb1.name2count_train and name in cdb2.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] # snames cdb.snames = cdb1.snames.union(cdb2.snames) # vocab, adding counts if they occur in both cdb.vocab = deepcopy(cdb1.vocab) - for word in cdb2.vocab: - if word in cdb.vocab: - cdb.vocab[word] += cdb2.vocab[word] - else: - cdb.vocab[word] = cdb2.vocab[word] + if overwrite_training != 1: + for word in cdb2.vocab: + if word in cdb.vocab and overwrite_training == 0: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] return cdb diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 29c603daa..3ff7e5dad 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -101,32 +101,43 @@ def test_merge_cdb(self): cdb1 = maker1.prepare_csvs(csv_paths=[path]) cdb2 = maker2.prepare_csvs(csv_paths=[path]) - # generating vectors and setting up + # generating context vectors here for for testing the weighted average function (based off cui2count_train) zeroes = np.zeros(shape=(1,300)) ones = np.ones(shape=(1,300)) for i, cui in enumerate(cdb1.cui2names): - cdb1.cui2context_vectors[cui] = {"short" : ones} - cdb2.cui2context_vectors[cui] = {"short" : zeroes} + cdb1.cui2context_vectors[cui] = {"short": ones} + cdb2.cui2context_vectors[cui] = {"short": zeroes} cdb1.cui2count_train[cui] = 1 cdb2.cui2count_train[cui] = i + 1 - test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper" : "P"}} + # adding new names and cuis to each cdb to test after merging + test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} cdb1.add_names("C0006826", test_add) - unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper" : "P"}} + unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} cdb2.add_names("UniqueTest", unique_test) - cdb2.cui2context_vectors["UniqueTest"] = {"short" : ones} + cdb2.cui2context_vectors["UniqueTest"] = {"short": zeroes} + cdb2.addl_info["cui2ontologies"] = {} + cdb2.addl_info["cui2description"] = {} + for cui in cdb2.cui2names: + cdb2.addl_info["cui2ontologies"][cui] = ["test_ontology"] + cdb2.addl_info["cui2description"][cui] = "test_description" # merging cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) + overwrite_cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2, overwrite_training=2, full_build=True) # tests self.assertIn("test", cdb.cui2names["C0006826"]) self.assertIn("test_name", cdb.cui2snames["C0006826"]) self.assertEqual("Cancer", cdb.cui2preferred_name["C0006826"]) - self.assertTrue(np.array_equal(np.ones(shape=(1,300)), cdb.cui2context_vectors["UniqueTest"]["short"])) - base = np.ones(shape=(1,300)) + self.assertTrue(np.array_equal(zeroes, cdb.cui2context_vectors["UniqueTest"]["short"])) for i, cui in enumerate(cdb1.cui2names): - self.assertTrue(np.array_equal(cdb.cui2context_vectors[cui]["short"], np.divide(base, i+2))) - + self.assertTrue(np.array_equal(cdb.cui2context_vectors[cui]["short"], np.divide(ones, 
i+2))) + self.assertEqual(cdb.addl_info["cui2ontologies"], dict()) + self.assertEqual(cdb.addl_info["cui2ontologies"], dict()) + for cui in cdb2.cui2names: + self.assertTrue(np.array_equal(overwrite_cdb.cui2context_vectors[cui]["short"], zeroes)) + self.assertEqual(overwrite_cdb.addl_info["cui2ontologies"][cui], {"test_ontology"}) + self.assertEqual(overwrite_cdb.addl_info["cui2description"][cui], "test_description") if __name__ == '__main__': From 7d694f2ecaa1025a08a504fb26f45c0fb0e2d3ab Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 13 Dec 2023 20:41:06 +0000 Subject: [PATCH 13/17] moved set up of cdbs to be merged to tests.helper --- tests/helper.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_cdb.py | 30 ++++-------------------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/tests/helper.py b/tests/helper.py index 9fb66589b..3da571758 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -6,6 +6,8 @@ import numpy as np from medcat.vocab import Vocab +from medcat.cdb_maker import CDBMaker +from medcat.config import Config class AsyncMock(unittest.mock.MagicMock): @@ -86,3 +88,36 @@ def check_or_download(self): return with open(self.vocab_path, 'wb') as f: f.write(tmp.content) + + +class ForCDBMerging: + + def __init__(self) -> None: + # generating cdbs - two maker are requested as they point to the same created CDB. + config = Config() + config.general["spacy_model"] = "en_core_web_md" + maker1 = CDBMaker(config) + maker2 = CDBMaker(config) # second maker is required as it will otherwise point to same object + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_creator", "umls_sample.csv") + self.cdb1 = maker1.prepare_csvs(csv_paths=[path]) + self.cdb2 = maker2.prepare_csvs(csv_paths=[path]) + + # generating context vectors here for for testing the weighted average function (based off cui2count_train) + zeroes = np.zeros(shape=(1,300)) + ones = np.ones(shape=(1,300)) + for i, cui in enumerate(self.cdb1.cui2names): + self.cdb1.cui2context_vectors[cui] = {"short": ones} + self.cdb2.cui2context_vectors[cui] = {"short": zeroes} + self.cdb1.cui2count_train[cui] = 1 + self.cdb2.cui2count_train[cui] = i + 1 + # adding new names and cuis to each cdb to test after merging + test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} + self.cdb1.add_names("C0006826", test_add) + unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} + self.cdb2.add_names("UniqueTest", unique_test) + self.cdb2.cui2context_vectors["UniqueTest"] = {"short": zeroes} + self.cdb2.addl_info["cui2ontologies"] = {} + self.cdb2.addl_info["cui2description"] = {} + for cui in self.cdb2.cui2names: + self.cdb2.addl_info["cui2ontologies"][cui] = ["test_ontology"] + self.cdb2.addl_info["cui2description"][cui] = "test_description" diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 3ff7e5dad..08b0cee88 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -7,6 +7,7 @@ from medcat.config import Config from medcat.cdb_maker import CDBMaker from medcat.cdb import CDB +from .helper import ForCDBMerging class CDBTests(unittest.TestCase): @@ -92,34 +93,11 @@ def test_cui2snames_population(self): def test_merge_cdb(self): - # generating cdbs - two maker are requested as they point to the same created CDB. 
- config = Config() - config.general["spacy_model"] = "en_core_web_md" - maker1 = CDBMaker(config) - maker2 = CDBMaker(config) # second maker is required as it will otherwise point to same object - path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_creator", "umls_sample.csv") - cdb1 = maker1.prepare_csvs(csv_paths=[path]) - cdb2 = maker2.prepare_csvs(csv_paths=[path]) - - # generating context vectors here for for testing the weighted average function (based off cui2count_train) + to_merge = ForCDBMerging() + cdb1 = to_merge.cdb1 + cdb2 = to_merge.cdb2 zeroes = np.zeros(shape=(1,300)) ones = np.ones(shape=(1,300)) - for i, cui in enumerate(cdb1.cui2names): - cdb1.cui2context_vectors[cui] = {"short": ones} - cdb2.cui2context_vectors[cui] = {"short": zeroes} - cdb1.cui2count_train[cui] = 1 - cdb2.cui2count_train[cui] = i + 1 - # adding new names and cuis to each cdb to test after merging - test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} - cdb1.add_names("C0006826", test_add) - unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} - cdb2.add_names("UniqueTest", unique_test) - cdb2.cui2context_vectors["UniqueTest"] = {"short": zeroes} - cdb2.addl_info["cui2ontologies"] = {} - cdb2.addl_info["cui2description"] = {} - for cui in cdb2.cui2names: - cdb2.addl_info["cui2ontologies"][cui] = ["test_ontology"] - cdb2.addl_info["cui2description"][cui] = "test_description" # merging cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) From 7cdd208c69eedf43c176cbe2fbbeba7de17ecd9f Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 14 Dec 2023 17:56:37 +0000 Subject: [PATCH 14/17] moved merge_cdb to utils and created test_cdb_utils --- medcat/cdb.py | 111 ------------------------------- medcat/utils/cdb_utils.py | 120 ++++++++++++++++++++++++++++++++++ tests/helper.py | 2 +- tests/test_cdb.py | 27 -------- tests/utils/test_cdb_utils.py | 42 ++++++++++++ 5 files changed, 163 insertions(+), 139 deletions(-) create mode 100644 medcat/utils/cdb_utils.py create mode 100644 tests/utils/test_cdb_utils.py diff --git a/medcat/cdb.py b/medcat/cdb.py index 1737b4bad..2ca8382a7 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -7,7 +7,6 @@ import numpy as np from typing import Dict, Set, Optional, List, Union, cast from functools import partial -from copy import deepcopy import os from medcat import __version__ @@ -925,113 +924,3 @@ def calculate_hash(self): self._hash = hasher.hexdigest() logger.info("Found new CDB hash: %s", self._hash) return self._hash - - @staticmethod - def merge_cdb(cdb1: "CDB", - cdb2: "CDB", - overwrite_training: int = 0, - full_build: bool = False): - """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. - `addl_info` can not be perfectly merged, and will prioritise cdb1. see `full_build` - - Args: - cdb1 (medcat.cdb.CDB): - The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as - cui2preferred_name), this cdb values will be prioritised over cdb2. - cdb2 (medcat.cdb.CDB): - The second medcat cdb to merge. - overwrite_training (int): - Choose to prioritise a CDB's context vectors values over merging gracefully. 
0 - no prio, 1 - CDB1, 2 - CDB2 - full_build (bool): - Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" - """ - config = deepcopy(cdb1.config) - cdb = CDB(config) - - # Copy CDB 1 - as all settings from CDB 1 will be carried over - cdb.cui2names = deepcopy(cdb1.cui2names) - cdb.cui2snames = deepcopy(cdb1.cui2snames) - cdb.cui2count_train = deepcopy(cdb1.cui2count_train) - cdb.cui2info = deepcopy(cdb1.cui2info) - cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) - cdb.cui2tags = deepcopy(cdb1.cui2tags) - cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) - cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) - cdb.name2cuis = deepcopy(cdb1.name2cuis) - cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) - cdb.name2count_train = deepcopy(cdb1.name2count_train) - cdb.name_isupper = deepcopy(cdb1.name_isupper) - if full_build: - cdb.addl_info = deepcopy(cdb1.addl_info) - - # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name - for cui in cdb2.cui2names: - names = dict() - for name in cdb2.cui2names[cui]: - names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} - name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' - # For addl_info check cui2original_names as they MUST be added - ontologies = set() - description = '' - to_build = False - if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): - to_build = True - if 'cui2ontologies' in cdb2.addl_info: - ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) - if 'cui2description' in cdb2.addl_info: - description = cdb2.addl_info['cui2description'][cui] - cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, - type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) - if cui in cdb1.cui2names: - if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): - if overwrite_training == 2 and cui in cdb2.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] - else: - cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) - if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): - if overwrite_training == 2 and cui in cdb2.cui2context_vectors: - weights = [0, 1] - else: - norm = cdb.cui2count_train[cui] - weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] - contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short - for s in contexts: - cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) - if cui in cdb1.cui2tags: - cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) - if cui in cdb1.cui2type_ids: - cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) - else: - if cui in cdb2.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2info: - cdb.cui2info[cui] = cdb2.cui2info[cui] - if cui in cdb2.cui2context_vectors: - cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] - if 
cui in cdb2.cui2tags: - cdb.cui2tags[cui] = cdb2.cui2tags[cui] - if cui in cdb2.cui2type_ids: - cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - - if overwrite_training != 1: - for name in cdb2.name2cuis: - if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs - if name in cdb1.name2count_train and name in cdb2.name2count_train: - cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: - if name in cdb2.name2count_train: - cdb.name2count_train[name] = cdb2.name2count_train[name] - - # snames - cdb.snames = cdb1.snames.union(cdb2.snames) - - # vocab, adding counts if they occur in both - cdb.vocab = deepcopy(cdb1.vocab) - if overwrite_training != 1: - for word in cdb2.vocab: - if word in cdb.vocab and overwrite_training == 0: - cdb.vocab[word] += cdb2.vocab[word] - else: - cdb.vocab[word] = cdb2.vocab[word] - - return cdb diff --git a/medcat/utils/cdb_utils.py b/medcat/utils/cdb_utils.py new file mode 100644 index 000000000..b7097c3f6 --- /dev/null +++ b/medcat/utils/cdb_utils.py @@ -0,0 +1,120 @@ +import logging +import numpy as np + +from copy import deepcopy +from medcat.cdb import CDB + +logger = logging.getLogger(__name__) # separate logger from the package-level one + + +class cdb_utils(object): + + @staticmethod + def merge_cdb(cdb1: "CDB", + cdb2: "CDB", + overwrite_training: int = 0, + full_build: bool = False): + """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. + `addl_info` can not be perfectly merged, and will prioritise cdb1. see `full_build` + + Args: + cdb1 (medcat.cdb.CDB): + The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as + cui2preferred_name), this cdb values will be prioritised over cdb2. + cdb2 (medcat.cdb.CDB): + The second medcat cdb to merge. + overwrite_training (int): + Choose to prioritise a CDB's context vectors values over merging gracefully. 
0 - no prio, 1 - CDB1, 2 - CDB2 + full_build (bool): + Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" + """ + config = deepcopy(cdb1.config) + cdb = CDB(config) + + # Copy CDB 1 - as all settings from CDB 1 will be carried over + cdb.cui2names = deepcopy(cdb1.cui2names) + cdb.cui2snames = deepcopy(cdb1.cui2snames) + cdb.cui2count_train = deepcopy(cdb1.cui2count_train) + cdb.cui2info = deepcopy(cdb1.cui2info) + cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) + cdb.cui2tags = deepcopy(cdb1.cui2tags) + cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) + cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) + cdb.name2cuis = deepcopy(cdb1.name2cuis) + cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) + cdb.name2count_train = deepcopy(cdb1.name2count_train) + cdb.name_isupper = deepcopy(cdb1.name_isupper) + if full_build: + cdb.addl_info = deepcopy(cdb1.addl_info) + + # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name + for cui in cdb2.cui2names: + names = dict() + for name in cdb2.cui2names[cui]: + names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} + name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' + # For addl_info check cui2original_names as they MUST be added + ontologies = set() + description = '' + to_build = False + if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): + to_build = True + if 'cui2ontologies' in cdb2.addl_info: + ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) + if 'cui2description' in cdb2.addl_info: + description = cdb2.addl_info['cui2description'][cui] + cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, + type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) + if cui in cdb1.cui2names: + if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): + if overwrite_training == 2 and cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + else: + cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) + if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): + if overwrite_training == 2 and cui in cdb2.cui2context_vectors: + weights = [0, 1] + else: + norm = cdb.cui2count_train[cui] + weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] + contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short + for s in contexts: + cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + if cui in cdb1.cui2tags: + cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: + cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + else: + if cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: + cdb.cui2info[cui] = cdb2.cui2info[cui] + if cui in cdb2.cui2context_vectors: + cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if 
cui in cdb2.cui2tags: + cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: + cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] + + if overwrite_training != 1: + for name in cdb2.name2cuis: + if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs + if name in cdb1.name2count_train and name in cdb2.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] + + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) + + # vocab, adding counts if they occur in both + cdb.vocab = deepcopy(cdb1.vocab) + if overwrite_training != 1: + for word in cdb2.vocab: + if word in cdb.vocab and overwrite_training == 0: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] + + return cdb diff --git a/tests/helper.py b/tests/helper.py index 3da571758..52943c3cd 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -119,5 +119,5 @@ def __init__(self) -> None: self.cdb2.addl_info["cui2ontologies"] = {} self.cdb2.addl_info["cui2description"] = {} for cui in self.cdb2.cui2names: - self.cdb2.addl_info["cui2ontologies"][cui] = ["test_ontology"] + self.cdb2.addl_info["cui2ontologies"][cui] = {"test_ontology"} self.cdb2.addl_info["cui2description"][cui] = "test_description" diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 08b0cee88..eb98e28ba 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -7,7 +7,6 @@ from medcat.config import Config from medcat.cdb_maker import CDBMaker from medcat.cdb import CDB -from .helper import ForCDBMerging class CDBTests(unittest.TestCase): @@ -92,31 +91,5 @@ def test_cui2snames_population(self): self.assertIn(cui, self.undertest.cui2snames) - def test_merge_cdb(self): - to_merge = ForCDBMerging() - cdb1 = to_merge.cdb1 - cdb2 = to_merge.cdb2 - zeroes = np.zeros(shape=(1,300)) - ones = np.ones(shape=(1,300)) - - # merging - cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) - overwrite_cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2, overwrite_training=2, full_build=True) - - # tests - self.assertIn("test", cdb.cui2names["C0006826"]) - self.assertIn("test_name", cdb.cui2snames["C0006826"]) - self.assertEqual("Cancer", cdb.cui2preferred_name["C0006826"]) - self.assertTrue(np.array_equal(zeroes, cdb.cui2context_vectors["UniqueTest"]["short"])) - for i, cui in enumerate(cdb1.cui2names): - self.assertTrue(np.array_equal(cdb.cui2context_vectors[cui]["short"], np.divide(ones, i+2))) - self.assertEqual(cdb.addl_info["cui2ontologies"], dict()) - self.assertEqual(cdb.addl_info["cui2ontologies"], dict()) - for cui in cdb2.cui2names: - self.assertTrue(np.array_equal(overwrite_cdb.cui2context_vectors[cui]["short"], zeroes)) - self.assertEqual(overwrite_cdb.addl_info["cui2ontologies"][cui], {"test_ontology"}) - self.assertEqual(overwrite_cdb.addl_info["cui2description"][cui], "test_description") - - if __name__ == '__main__': unittest.main() diff --git a/tests/utils/test_cdb_utils.py b/tests/utils/test_cdb_utils.py new file mode 100644 index 000000000..3f699d767 --- /dev/null +++ b/tests/utils/test_cdb_utils.py @@ -0,0 +1,42 @@ +import unittest +import numpy as np +from tests.helper import ForCDBMerging +from medcat.utils.cdb_utils import cdb_utils + + +class CDBMergeTests(unittest.TestCase): + @classmethod + def setUp(cls) -> None: + to_merge = ForCDBMerging() + cls.cdb1 = to_merge.cdb1 + cls.cdb2 = to_merge.cdb2 + cls.merged_cdb 
= cdb_utils.merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2) + cls.overwrite_cdb = cdb_utils.merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2, overwrite_training=2, full_build=True) + cls.zeroes = np.zeros(shape=(1,300)) + cls.ones = np.ones(shape=(1,300)) + + def test_merge_inserts(self): + self.assertIn("test", self.merged_cdb.cui2names["C0006826"]) + self.assertIn("test_name", self.merged_cdb.cui2snames["C0006826"]) + self.assertEqual("Cancer", self.merged_cdb.cui2preferred_name["C0006826"]) + + def test_no_full_build(self): + self.assertEqual(self.merged_cdb.addl_info["cui2ontologies"], dict()) + self.assertEqual(self.merged_cdb.addl_info["cui2description"], dict()) + + def test_full_build(self): + for cui in self.cdb2.cui2names: + self.assertEqual(self.overwrite_cdb.addl_info["cui2ontologies"][cui], {"test_ontology"}) + self.assertEqual(self.overwrite_cdb.addl_info["cui2description"][cui], "test_description") + + def test_vector_merge(self): + self.assertTrue(np.array_equal(self.zeroes, self.merged_cdb.cui2context_vectors["UniqueTest"]["short"])) + for i, cui in enumerate(self.cdb1.cui2names): + self.assertTrue(np.array_equal(self.merged_cdb.cui2context_vectors[cui]["short"], np.divide(self.ones, i+2))) + + + def test_overwrite_parameter(self): + for cui in self.cdb2.cui2names: + self.assertTrue(np.array_equal(self.overwrite_cdb.cui2context_vectors[cui]["short"], self.zeroes)) + self.assertEqual(self.overwrite_cdb.addl_info["cui2ontologies"][cui], {"test_ontology"}) + self.assertEqual(self.overwrite_cdb.addl_info["cui2description"][cui], "test_description") From fe9ef662cc9446c7556df1da76a2f9d040397632 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 15 Dec 2023 09:54:38 +0000 Subject: [PATCH 15/17] removed class wrapper in cdb utils and fixed class set up in tests --- medcat/utils/cdb_utils.py | 203 +++++++++++++++++----------------- tests/utils/test_cdb_utils.py | 18 +-- 2 files changed, 109 insertions(+), 112 deletions(-) diff --git a/medcat/utils/cdb_utils.py b/medcat/utils/cdb_utils.py index b7097c3f6..445fb7d6f 100644 --- a/medcat/utils/cdb_utils.py +++ b/medcat/utils/cdb_utils.py @@ -7,114 +7,111 @@ logger = logging.getLogger(__name__) # separate logger from the package-level one -class cdb_utils(object): +def merge_cdb(cdb1: "CDB", + cdb2: "CDB", + overwrite_training: int = 0, + full_build: bool = False): + """Merge two CDBs together to produce a new, single CDB. The contents of the input CDBs will not be changed. + `addl_info` cannot be perfectly merged, and will prioritise cdb1. See `full_build`. - @staticmethod - def merge_cdb(cdb1: "CDB", - cdb2: "CDB", - overwrite_training: int = 0, - full_build: bool = False): - """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. - `addl_info` can not be perfectly merged, and will prioritise cdb1. see `full_build` + Args: + cdb1 (medcat.cdb.CDB): + The first medcat cdb to merge. In cases where merging isn't ideal (such as + cui2preferred_name), this CDB's values will be prioritised over cdb2. + cdb2 (medcat.cdb.CDB): + The second medcat cdb to merge. + overwrite_training (int): + Choose to prioritise a CDB's context vector values over merging gracefully. 0 - no prio, 1 - CDB1, 2 - CDB2 + full_build (bool): + Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" + """ + config = deepcopy(cdb1.config) + cdb = CDB(config) - Args: - cdb1 (medcat.cdb.CDB): - The first medcat cdb to merge.
In cases where merging isn't suitable isn't ideal (such as - cui2preferred_name), this cdb values will be prioritised over cdb2. - cdb2 (medcat.cdb.CDB): - The second medcat cdb to merge. - overwrite_training (int): - Choose to prioritise a CDB's context vectors values over merging gracefully. 0 - no prio, 1 - CDB1, 2 - CDB2 - full_build (bool): - Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" - """ - config = deepcopy(cdb1.config) - cdb = CDB(config) + # Copy CDB 1 - as all settings from CDB 1 will be carried over + cdb.cui2names = deepcopy(cdb1.cui2names) + cdb.cui2snames = deepcopy(cdb1.cui2snames) + cdb.cui2count_train = deepcopy(cdb1.cui2count_train) + cdb.cui2info = deepcopy(cdb1.cui2info) + cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) + cdb.cui2tags = deepcopy(cdb1.cui2tags) + cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) + cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) + cdb.name2cuis = deepcopy(cdb1.name2cuis) + cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) + cdb.name2count_train = deepcopy(cdb1.name2count_train) + cdb.name_isupper = deepcopy(cdb1.name_isupper) + if full_build: + cdb.addl_info = deepcopy(cdb1.addl_info) - # Copy CDB 1 - as all settings from CDB 1 will be carried over - cdb.cui2names = deepcopy(cdb1.cui2names) - cdb.cui2snames = deepcopy(cdb1.cui2snames) - cdb.cui2count_train = deepcopy(cdb1.cui2count_train) - cdb.cui2info = deepcopy(cdb1.cui2info) - cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) - cdb.cui2tags = deepcopy(cdb1.cui2tags) - cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) - cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) - cdb.name2cuis = deepcopy(cdb1.name2cuis) - cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) - cdb.name2count_train = deepcopy(cdb1.name2count_train) - cdb.name_isupper = deepcopy(cdb1.name_isupper) - if full_build: - cdb.addl_info = deepcopy(cdb1.addl_info) + # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name + for cui in cdb2.cui2names: + names = dict() + for name in cdb2.cui2names[cui]: + names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} + name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' + # For addl_info check cui2original_names as they MUST be added + ontologies = set() + description = '' + to_build = False + if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): + to_build = True + if 'cui2ontologies' in cdb2.addl_info: + ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) + if 'cui2description' in cdb2.addl_info: + description = cdb2.addl_info['cui2description'][cui] + cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, + type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) + if cui in cdb1.cui2names: + if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): + if overwrite_training == 2 and cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + else: + cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) + if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): + if overwrite_training == 2 and 
cui in cdb2.cui2context_vectors: + weights = [0, 1] + else: + norm = cdb.cui2count_train[cui] + weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] + contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short + for s in contexts: + cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + if cui in cdb1.cui2tags: + cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: + cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + else: + if cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: + cdb.cui2info[cui] = cdb2.cui2info[cui] + if cui in cdb2.cui2context_vectors: + cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if cui in cdb2.cui2tags: + cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: + cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name - for cui in cdb2.cui2names: - names = dict() - for name in cdb2.cui2names[cui]: - names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} - name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' - # For addl_info check cui2original_names as they MUST be added - ontologies = set() - description = '' - to_build = False - if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): - to_build = True - if 'cui2ontologies' in cdb2.addl_info: - ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) - if 'cui2description' in cdb2.addl_info: - description = cdb2.addl_info['cui2description'][cui] - cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, - type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) - if cui in cdb1.cui2names: - if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): - if overwrite_training == 2 and cui in cdb2.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] - else: - cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) - if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): - if overwrite_training == 2 and cui in cdb2.cui2context_vectors: - weights = [0, 1] - else: - norm = cdb.cui2count_train[cui] - weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] - contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short - for s in contexts: - cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) - if cui in cdb1.cui2tags: - cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) - if cui in cdb1.cui2type_ids: - cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + if overwrite_training != 1: + for name 
in cdb2.name2cuis: + if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs + if name in cdb1.name2count_train and name in cdb2.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason else: - if cui in cdb2.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2info: - cdb.cui2info[cui] = cdb2.cui2info[cui] - if cui in cdb2.cui2context_vectors: - cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] - if cui in cdb2.cui2tags: - cdb.cui2tags[cui] = cdb2.cui2tags[cui] - if cui in cdb2.cui2type_ids: - cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - - if overwrite_training != 1: - for name in cdb2.name2cuis: - if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs - if name in cdb1.name2count_train and name in cdb2.name2count_train: - cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: - if name in cdb2.name2count_train: - cdb.name2count_train[name] = cdb2.name2count_train[name] + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] - # snames - cdb.snames = cdb1.snames.union(cdb2.snames) + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) - # vocab, adding counts if they occur in both - cdb.vocab = deepcopy(cdb1.vocab) - if overwrite_training != 1: - for word in cdb2.vocab: - if word in cdb.vocab and overwrite_training == 0: - cdb.vocab[word] += cdb2.vocab[word] - else: - cdb.vocab[word] = cdb2.vocab[word] + # vocab, adding counts if they occur in both + cdb.vocab = deepcopy(cdb1.vocab) + if overwrite_training != 1: + for word in cdb2.vocab: + if word in cdb.vocab and overwrite_training == 0: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] - return cdb + return cdb diff --git a/tests/utils/test_cdb_utils.py b/tests/utils/test_cdb_utils.py index 3f699d767..bc0e6796f 100644 --- a/tests/utils/test_cdb_utils.py +++ b/tests/utils/test_cdb_utils.py @@ -1,19 +1,19 @@ import unittest import numpy as np from tests.helper import ForCDBMerging -from medcat.utils.cdb_utils import cdb_utils +from medcat.utils.cdb_utils import merge_cdb class CDBMergeTests(unittest.TestCase): - @classmethod - def setUp(cls) -> None: + + def setUp(self) -> None: to_merge = ForCDBMerging() - cls.cdb1 = to_merge.cdb1 - cls.cdb2 = to_merge.cdb2 - cls.merged_cdb = cdb_utils.merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2) - cls.overwrite_cdb = cdb_utils.merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2, overwrite_training=2, full_build=True) - cls.zeroes = np.zeros(shape=(1,300)) - cls.ones = np.ones(shape=(1,300)) + self.cdb1 = to_merge.cdb1 + self.cdb2 = to_merge.cdb2 + self.merged_cdb = merge_cdb(cdb1=self.cdb1, cdb2=self.cdb2) + self.overwrite_cdb = merge_cdb(cdb1=self.cdb1, cdb2=self.cdb2, overwrite_training=2, full_build=True) + self.zeroes = np.zeros(shape=(1,300)) + self.ones = np.ones(shape=(1,300)) def test_merge_inserts(self): self.assertIn("test", self.merged_cdb.cui2names["C0006826"]) From f70e61d97e51a7680475b12ae8b7868d0c0f1728 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 15 Dec 2023 10:07:24 +0000 Subject: [PATCH 16/17] changed test object setup to class setup --- tests/utils/test_cdb_utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/utils/test_cdb_utils.py b/tests/utils/test_cdb_utils.py index 
bc0e6796f..777a2506b 100644 --- a/tests/utils/test_cdb_utils.py +++ b/tests/utils/test_cdb_utils.py @@ -6,14 +6,15 @@ class CDBMergeTests(unittest.TestCase): - def setUp(self) -> None: + @classmethod + def setUpClass(cls): to_merge = ForCDBMerging() - self.cdb1 = to_merge.cdb1 - self.cdb2 = to_merge.cdb2 - self.merged_cdb = merge_cdb(cdb1=self.cdb1, cdb2=self.cdb2) - self.overwrite_cdb = merge_cdb(cdb1=self.cdb1, cdb2=self.cdb2, overwrite_training=2, full_build=True) - self.zeroes = np.zeros(shape=(1,300)) - self.ones = np.ones(shape=(1,300)) + cls.cdb1 = to_merge.cdb1 + cls.cdb2 = to_merge.cdb2 + cls.merged_cdb = merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2) + cls.overwrite_cdb = merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2, overwrite_training=2, full_build=True) + cls.zeroes = np.zeros(shape=(1,300)) + cls.ones = np.ones(shape=(1,300)) def test_merge_inserts(self): self.assertIn("test", self.merged_cdb.cui2names["C0006826"]) From c74fe1f64ff590c6bf8a139a698600b2d2087967 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 15 Dec 2023 10:24:26 +0000 Subject: [PATCH 17/17] removed erroneous new line --- tests/test_cdb.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_cdb.py b/tests/test_cdb.py index eb98e28ba..f7be24d64 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -90,6 +90,5 @@ def test_cui2snames_population(self): with self.subTest(cui): self.assertIn(cui, self.undertest.cui2snames) - if __name__ == '__main__': unittest.main()
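
A minimal usage sketch for the utility introduced by this series; the CDB paths below are placeholders, and it assumes concept databases previously saved with MedCAT's CDB.save/CDB.load helpers:

    from medcat.cdb import CDB
    from medcat.utils.cdb_utils import merge_cdb

    # Load two previously built concept databases (paths are hypothetical).
    cdb1 = CDB.load("cdb1.dat")
    cdb2 = CDB.load("cdb2.dat")

    # Default merge: cdb1 is prioritised wherever values cannot be combined gracefully.
    merged = merge_cdb(cdb1=cdb1, cdb2=cdb2)

    # Prioritise cdb2's training counts/vectors and carry over the "cui2ontologies"
    # and "cui2description" addl_info entries.
    overwritten = merge_cdb(cdb1=cdb1, cdb2=cdb2, overwrite_training=2, full_build=True)

    merged.save("merged_cdb.dat")

Per the merge_cdb docstring, the input CDBs themselves are not modified; a new CDB is returned.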