From bae870a22ed4cd39b334329c1f0ba3e659eb26e8 Mon Sep 17 00:00:00 2001 From: joeflack4 Date: Thu, 24 Feb 2022 18:54:41 -0500 Subject: [PATCH] New Feature: SNOMED::ICD10CM Mapping Support - Added feature to allow for conversion of these premade mappings provided by SNOMED into SSSOM format. (WIP) General updates - cli.py: Reorganized SSSOM_READ_FORMATS: Top half are plain data formats, and bottom half are special-case formats. Both halves of the list are alphabetically sorted. Temp updates - Changed some relative imports to absolute imports, in order to speed up development and make debugging easier. It is possible that this could be a good permanent change too, though. --- sssom/cli.py | 14 ++--- sssom/parsers.py | 156 +++++++++++++++++++++++++++++++++++++++++++++++ sssom/util.py | 7 ++- 3 files changed, 167 insertions(+), 10 deletions(-) diff --git a/sssom/cli.py b/sssom/cli.py index a1e0a20b..b4b003e0 100644 --- a/sssom/cli.py +++ b/sssom/cli.py @@ -24,12 +24,12 @@ from rdflib import Graph from scipy.stats import chi2_contingency -from .cliques import split_into_cliques, summarize_cliques -from .io import convert_file, parse_file, split_file, validate_file -from .parsers import read_sssom_table -from .rdf_util import rewire_graph -from .sparql_util import EndpointConfig, query_mappings -from .util import ( +from sssom.cliques import split_into_cliques, summarize_cliques +from sssom.io import convert_file, parse_file, split_file, validate_file +from sssom.parsers import read_sssom_table +from sssom.rdf_util import rewire_graph +from sssom.sparql_util import EndpointConfig, query_mappings +from sssom.util import ( SSSOM_EXPORT_FORMATS, SSSOM_READ_FORMATS, MappingSetDataFrame, @@ -41,7 +41,7 @@ remove_unmatched, to_mapping_set_dataframe, ) -from .writers import write_table +from sssom.writers import write_table # Click input options common across commands input_argument = click.argument("input", required=True, type=click.Path()) diff --git a/sssom/parsers.py 
b/sssom/parsers.py index b0b4711e..47c3d2bd 100644 --- a/sssom/parsers.py +++ b/sssom/parsers.py @@ -140,6 +140,24 @@ def read_obographs_json( ) +def read_snomed_icd10cm_map_tsv( + file_path: str, + prefix_map: Dict[str, str] = None, + meta: Dict[str, str] = None, +) -> MappingSetDataFrame: + """Parse special SNOMED ICD10CM mapping file and translate it into a MappingSetDataFrame. + + :param file_path: The path to the SNOMED ICD10CM mapping TSV file + :param prefix_map: an optional prefix map + :param meta: an optional dictionary of metadata elements + :return: A SSSOM MappingSetDataFrame + """ + raise_for_bad_path(file_path) + df = read_pandas(file_path) + df2 = from_snomed_icd10cm_map_tsv(df, prefix_map=prefix_map, meta=meta) + return df2 + + def _get_prefix_map_and_metadata( prefix_map: Optional[PrefixMap] = None, meta: Optional[MetadataType] = None ) -> Metadata: @@ -499,6 +517,141 @@ def from_obographs( return to_mapping_set_dataframe(mdoc) +def from_snomed_icd10cm_map_tsv( + df: pd.DataFrame, + prefix_map: Optional[PrefixMap] = None, + meta: Optional[MetadataType] = None, +) -> MappingSetDataFrame: + """Convert a snomed_icd10cm_map dataframe to a MappingSetDataFrame. + + :param df: A mappings dataframe + :param prefix_map: A prefix map + :param meta: A metadata dictionary + :return: MappingSetDataFrame + + # Field descriptions + # - Taken from: doc_Icd10cmMapReleaseNotes_Current-en-US_US1000124_20210901.pdf + FIELD,DATA_TYPE,PURPOSE + - id,UUID,A 128 bit unsigned integer, uniquely identifying the map record + - effectiveTime,Time,Specifies the inclusive date at which this change becomes effective. + - active,Boolean,Specifies whether the member’s state was active (=1) or inactive (=0) from the nominal release date + specified by the effectiveTime field. + - moduleId,SctId,Identifies the member version’s module. Set to a child of 900000000000443000|Module| within the + metadata hierarchy. 
+ - refSetId,SctId,Set to one of the children of the |Complex map type| concept in the metadata hierarchy. + - referencedComponentId,SctId,The SNOMED CT source concept ID that is the subject of the map record. + - mapGroup,Integer,An integer identifying a grouping of complex map records which will designate one map target at + the time of map rule evaluation. Source concepts that require two map targets for classification will have two sets + of map groups. + - mapPriority,Integer,Within a map group, the mapPriority specifies the order in which complex map records should be + evaluated to determine the correct map target. + - mapRule,String,A machine-readable rule, (evaluating to either ‘true’ or ‘false’ at run-time) that indicates + whether this map record should be selected within its map group + - mapAdvice,String,Human-readable advice that may be employed by the software vendor to give an end-user advice on + selection of the appropriate target code. This includes a) a summary statement of the map rule logic, b) a statement + of any limitations of the map record and c) additional classification guidance for the coding professional. + - mapTarget,String,The target ICD-10 classification code of the map record. + - correlationId,SctId,A child of |Map correlation value| in the metadata hierarchy, identifying the correlation + between the SNOMED CT concept and the target code. + - mapCategoryId,SctId,Identifies the SNOMED CT concept in the metadata hierarchy which is the MapCategory for the + associated map record. This is a subtype of 447634004 |ICD-10 Map Category value|. 
+ """ + # TODO: If using in the end, import at top of file + from dateutil import parser as date_parser + from .sssom_datamodel import MatchTypeEnum + + prefix_map = _ensure_prefix_map(prefix_map) + ms = _init_mapping_set(meta) + # https://www.findacode.com/snomed/447561005--snomed-ct-source-code-to-target-map-correlation-not-specified.html + match_type_snomed_unspecified_id = 447561005 + + mlist: List[Mapping] = [] + for _, row in df.iterrows(): + # This may look redundant, but I want to be explicit. In officially downloaded SNOMED mappings, all of them + # had correlationId of 447561005, which also happens to be 'unspecified'. + match_type = MatchTypeEnum('Unspecified') if row['correlationId'] == match_type_snomed_unspecified_id \ + else MatchTypeEnum('Unspecified') + # TODO: SNOMED: parse as many as possible: + # - id --> other + # - active --> other + # - moduleId --> subject_category? + # - refSetId --> subject_category? + # - mapGroup --> other? + # - mapPriority --> other? + # - mapRule --> objectCategory? other? + # ex: TRUE: when "ALWAYS " is in pipe-delimited list in mapAdvice, this always shows TRUE. Does this + # mean I could use skos:exactMatch in these cases? + # - mapAdvice --> objectCategory? other? + # ex: "ALWAYS Q71.30 | CONSIDER LATERALITY SPECIFICATION" + # any way to determine skos:narrowerThan, etc, from this? + + # TODO: SSSOM: use as many as possible + # https://mapping-commons.github.io/sssom/Mapping/ + # - subject_category: Optional[str] = None + # Description: The conceptual category to which the subject belongs to. This can be a string denoting the category or a term from a controlled vocabulary. + # Example: UBERON:0001062 (The CURIE of the Uberon term for "anatomical entity".) + # Example: biolink:Gene (The CURIE of the biolink class for genes.) + # - predicate_modifier + # Description: A modifier for negating the predicate. 
See https://github.com/mapping-commons/sssom/issues/40 for discussion + # Range: PredicateModifierEnum: (joe: only lists 'Not' as an option) + # Example: Not Negates the predicate, see documentation of predicate_modifier_enum + # - object_category + # Description: The conceptual category to which the object belongs. This can be a string denoting the category or a term from a controlled vocabulary. + # Example: UBERON:0001062 (The CURIE of the Uberon term for "anatomical entity".) + # Example: biolink:Gene (The CURIE of the biolink class for genes.) + + # TODO: Make GH Issue: Suggest JSON instead of pipe-delimited. Also it doesn't say what should be the delimiter + # ...between key/val pairs (which I'm guessing is either = or :). + # - other: Description: Pipe separated list of key value pairs for properties not part of the SSSOM spec. + # Can be used to encode additional provenance data. + # - comment: Description: Free text field containing either curator notes or text generated by tool providing + # additional informative information. + + # - author_id: can this be "SNOMED"? + # - author_label: can this be "SNOMED"? + # - reviewer_id: can this be "SNOMED"? + # - reviewer_label: can this be "SNOMED"? + # - creator_id: can this be "SNOMED"? + # - creator_label: can this be "SNOMED"? + # - license: Is this something that can be determined? + # - subject_source: URL of some official page for SNOMED version used? + # - subject_source_version: Is this knowable? + # - object_source: URL of some official page for ICD10CM version used? + # - object_source_version: would this be "10CM" as in "ICD10CM"? Or something else? Or nothing? + # - mapping_provider: can this be "SNOMED"? + # - mapping_cardinality: Could I determine 1:1 or 1:many or many:1 based on: + # mapGroup, mapPriority, mapRule, mapAdvice? + # - match_term_type: What is this? + # - see_also: Should this be a URL to the SNOMED term? 
+ mdict = { + 'subject_id': f'SNOMED:{row["referencedComponentId"]}', + 'subject_label': row['referencedComponentName'], + # Does this represent what we want for our mapping predicate? Or is correlationId more suitable? + # ...or is there a SKOS predicate I can map to in case where predicate is unknown? I think most of these + # ...mappings are attempts at exact matches, but I can't be sure (at least not without using these fields + # ...to determine: mapGroup, mapPriority, mapRule, mapAdvice). + 'predicate_id': f'SNOMED:{row["mapCategoryId"]}', + 'predicate_label': row['mapCategoryName'], + 'object_id': f'ICD10CM:{row["mapTarget"]}', + 'object_label': row['mapTargetName'], + # If correlationId is indeed more appropriate for predicate_id, then I don't think there is a representative + # ...field for 'match_type'. + 'match_type': match_type, + 'mapping_date': date_parser.parse(str(row['effectiveTime'])).date(), + # 'xxx2': 'yyy', + # 'xxx3': 'yyy', + # 'xxx4': 'yyy', + # 'xxx5': 'yyy', + # 'xxx6': 'yyy', + } + mlist.append(_prepare_mapping(Mapping(**mdict))) + + ms.mappings = mlist # type:ignore + _set_metadata_in_mapping_set(mapping_set=ms, metadata=meta) + doc = MappingSetDocument(mapping_set=ms, prefix_map=prefix_map) + return to_mapping_set_dataframe(doc) + + # All from_* take as an input a python object (data frame, json, etc) and return a MappingSetDataFrame # All read_* take as an input a a file handle and return a MappingSetDataFrame (usually wrapping a from_* method) @@ -523,6 +676,9 @@ def get_parsing_function(input_format: Optional[str], filename: str) -> Callable return read_alignment_xml elif input_format == "obographs-json": return read_obographs_json + elif input_format == "snomed-icd10cm-map-tsv": + return read_snomed_icd10cm_map_tsv + else: raise Exception(f"Unknown input format: {input_format}") diff --git a/sssom/util.py b/sssom/util.py index 6782ba13..2e81bf40 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -43,12 +43,13 @@ PREFIX_MAP_KEY = 
"curie_map" SSSOM_READ_FORMATS = [ - "tsv", - "rdf", + "json", "owl", + "rdf", + "tsv", "alignment-api-xml", "obographs-json", - "json", + "snomed-icd10cm-map-tsv" ] SSSOM_EXPORT_FORMATS = ["tsv", "rdf", "owl", "json"]