New Feature: SNOMED::ICD10CM Mapping Support

- Added a feature (WIP) to convert the premade SNOMED-to-ICD10CM mappings provided by SNOMED into SSSOM format. General updates - util.py: Reorganized SSSOM_READ_FORMATS (imported by cli.py): the top half lists plain data formats and the bottom half lists special-case formats; each half is sorted alphabetically. Temp updates - Changed some relative imports to absolute imports in order to speed up development and make debugging easier. It is possible that this could be a good permanent change too, though.
mapping-commons · Mar 2, 2022 · bae870a · bae870a
1 parent bf9c32b
commit bae870a
Show file tree

Hide file tree

Showing 3 changed files with 167 additions and 10 deletions.
diff --git a/sssom/cli.py b/sssom/cli.py
@@ -24,12 +24,12 @@
 from rdflib import Graph
 from scipy.stats import chi2_contingency
 
-from .cliques import split_into_cliques, summarize_cliques
-from .io import convert_file, parse_file, split_file, validate_file
-from .parsers import read_sssom_table
-from .rdf_util import rewire_graph
-from .sparql_util import EndpointConfig, query_mappings
-from .util import (
+from sssom.cliques import split_into_cliques, summarize_cliques
+from sssom.io import convert_file, parse_file, split_file, validate_file
+from sssom.parsers import read_sssom_table
+from sssom.rdf_util import rewire_graph
+from sssom.sparql_util import EndpointConfig, query_mappings
+from sssom.util import (
     SSSOM_EXPORT_FORMATS,
     SSSOM_READ_FORMATS,
     MappingSetDataFrame,
@@ -41,7 +41,7 @@
     remove_unmatched,
     to_mapping_set_dataframe,
 )
-from .writers import write_table
+from sssom.writers import write_table
 
 # Click input options common across commands
 input_argument = click.argument("input", required=True, type=click.Path())

diff --git a/sssom/parsers.py b/sssom/parsers.py
@@ -140,6 +140,24 @@ def read_obographs_json(
     )
 
 
+def read_snomed_icd10cm_map_tsv(
+    file_path: str,
+    prefix_map: Optional[Dict[str, str]] = None,
+    meta: Optional[Dict[str, str]] = None,
+) -> MappingSetDataFrame:
+    """Parse a SNOMED ICD10CM mapping file and translate it into a MappingSetDataFrame.
+
+    :param file_path: The path to the SNOMED ICD10CM map file (checked with raise_for_bad_path)
+    :param prefix_map: an optional prefix map, passed through to from_snomed_icd10cm_map_tsv
+    :param meta: an optional dictionary of metadata elements, passed through as well
+    :return: A SSSOM MappingSetDataFrame
+    """
+    raise_for_bad_path(file_path)
+    df = read_pandas(file_path)
+    df2 = from_snomed_icd10cm_map_tsv(df, prefix_map=prefix_map, meta=meta)
+    return df2
+
+
 def _get_prefix_map_and_metadata(
     prefix_map: Optional[PrefixMap] = None, meta: Optional[MetadataType] = None
 ) -> Metadata:
@@ -499,6 +517,141 @@ def from_obographs(
     return to_mapping_set_dataframe(mdoc)
 
 
+def from_snomed_icd10cm_map_tsv(
+    df: pd.DataFrame,
+    prefix_map: Optional[PrefixMap] = None,
+    meta: Optional[MetadataType] = None,
+) -> MappingSetDataFrame:
+    """Convert a snomed_icd10cm_map dataframe to a MappingSetDataFrame.
+
+    :param df: A dataframe holding one SNOMED-to-ICD10CM map record per row
+    :param prefix_map: A prefix map
+    :param meta: A metadata dictionary
+    :return: MappingSetDataFrame
+
+    Field descriptions of the SNOMED map columns
+    (taken from: doc_Icd10cmMapReleaseNotes_Current-en-US_US1000124_20210901.pdf),
+    listed as FIELD,DATA_TYPE,PURPOSE:
+    - id,UUID,A 128 bit unsigned integer, uniquely identifying the map record
+    - effectiveTime,Time,Specifies the inclusive date at which this change becomes effective.
+    - active,Boolean,Specifies whether the member’s state was active (=1) or inactive (=0) from the nominal release date
+    specified by the effectiveTime field.
+    - moduleId,SctId,Identifies the member version’s module. Set to a child of 900000000000443000|Module| within the
+    metadata hierarchy.
+    - refSetId,SctId,Set to one of the children of the |Complex map type| concept in the metadata hierarchy.
+    - referencedComponentId,SctId,The SNOMED CT source concept ID that is the subject of the map record.
+    - mapGroup,Integer,An integer identifying a grouping of complex map records which will designate one map target at
+    the time of map rule evaluation. Source concepts that require two map targets for classification will have two sets
+    of map groups.
+    - mapPriority,Integer,Within a map group, the mapPriority specifies the order in which complex map records should be
+    evaluated to determine the correct map target.
+    - mapRule,String,A machine-readable rule, (evaluating to either ‘true’ or ‘false’ at run-time) that indicates
+    whether this map record should be selected within its map group
+    - mapAdvice,String,Human-readable advice that may be employed by the software vendor to give an end-user advice on
+    selection of the appropriate target code. This includes a) a summary statement of the map rule logic, b) a statement
+    of any limitations of the map record and c) additional classification guidance for the coding professional.
+    - mapTarget,String,The target ICD-10 classification code of the map record.
+    - correlationId,SctId,A child of |Map correlation value| in the metadata hierarchy, identifying the correlation
+    between the SNOMED CT concept and the target code.
+    - mapCategoryId,SctId,Identifies the SNOMED CT concept in the metadata hierarchy which is the MapCategory for the
+    associated map record. This is a subtype of 447634004 |ICD-10 Map Category value|.
+    """
+    # TODO: If using in the end, import at top of file
+    from dateutil import parser as date_parser
+    from .sssom_datamodel import MatchTypeEnum
+
+    prefix_map = _ensure_prefix_map(prefix_map)
+    ms = _init_mapping_set(meta)
+    # https://www.findacode.com/snomed/447561005--snomed-ct-source-code-to-target-map-correlation-not-specified.html
+    match_type_snomed_unspecified_id = 447561005
+
+    mlist: List[Mapping] = []
+    for _, row in df.iterrows():
+        # Deliberately explicit placeholder: both branches yield 'Unspecified' for now. In officially downloaded
+        # SNOMED mappings, every record had correlationId 447561005, which itself means 'unspecified'.
+        match_type = MatchTypeEnum('Unspecified') if row['correlationId'] == match_type_snomed_unspecified_id \
+            else MatchTypeEnum('Unspecified')
+        # TODO: SNOMED: parse as many as possible:
+        # - id --> other
+        # - active --> other
+        # - moduleId --> subject_category?
+        # - refsetId --> subject_category?
+        # - mapGroup  --> other?
+        # - mapPriority --> other?
+        # - mapRule  --> objectCategory? other?
+        #   ex: TRUE: when "ALWAYS <code>" is in pipe-delimited list in mapAdvice, this always shows TRUE. Does this
+        #       mean I could use skos:exactMatch in these cases?
+        # - mapAdvice --> objectCategory? other?
+        #   ex: "ALWAYS Q71.30 | CONSIDER LATERALITY SPECIFICATION"
+        #   any way to determine skos:narrowerThan, etc, from this?
+
+        # TODO: SSSOM: use as many as possible
+        # https://mapping-commons.github.io/sssom/Mapping/
+        # - subject_category: Optional[str] = None
+        # Description: The conceptual category to which the subject belongs to. This can be a string denoting the category or a term from a controlled vocabulary.
+        # Example: UBERON:0001062 (The CURIE of the Uberon term for "anatomical entity".)
+        # Example: biolink:Gene (The CURIE of the biolink class for genes.)
+        # - predicate_modifier
+        # Description: A modifier for negating the predicate. See https://github.com/mapping-commons/sssom/issues/40 for discussion
+        # Range: PredicateModifierEnum: (joe: only lists 'Not' as an option)
+        # Example: Not Negates the predicate, see documentation of predicate_modifier_enum
+        # - object_category
+        # Description: The conceptual category to which the object belongs to. This can be a string denoting the category or a term from a controlled vocabulary.
+        # Example: UBERON:0001062 (The CURIE of the Uberon term for "anatomical entity".)
+        # Example: biolink:Gene (The CURIE of the biolink class for genes.)
+
+        # TODO: Make GH Issue: Suggest JSON instead of pipe-delimited. Also it doesn't say what should be the delimiter
+        #  ...between key/val pairs (which I'm guessing is either = or :).
+        # - other: Description: Pipe separated list of key value pairs for properties not part of the SSSOM spec.
+        #   Can be used to encode additional provenance data.
+        # - comment: Description: Free text field containing either curator notes or text generated by tool providing
+        #   additional informative information.
+
+        # - author_id: can this be "SNOMED"?
+        # - author_label: can this be "SNOMED"?
+        # - reviewer_id: can this be "SNOMED"?
+        # - reviewer_label: can this be "SNOMED"?
+        # - creator_id: can this be "SNOMED"?
+        # - creator_label: can this be "SNOMED"?
+        # - license: Is this something that can be determined?
+        # - subject_source: URL of some official page for SNOMED version used?
+        # - subject_source_version: Is this knowable?
+        # - object_source: URL of some official page for ICD10CM version used?
+        # - object_source_version: would this be "10CM" as in "ICD10CM"? Or something else? Or nothing?
+        # - mapping_provider: can this be "SNOMED"?
+        # - mapping_cardinality: Could I determine 1:1 or 1:many or many:1 based on:
+        #   mapGroup, mapPriority, mapRule, mapAdvice?
+        # - match_term_type: What is this?
+        # - see_also: Should this be a URL to the SNOMED term?
+        # NOTE(review): 'referencedComponentName', 'mapCategoryName' and 'mapTargetName' are read below but are not
+        # among the fields documented in the docstring -- presumably they only exist in the human-readable variant
+        # of the map file; confirm against the input actually used. A dataframe lacking these columns makes the
+        # row[...] lookups below raise KeyError.
+        mdict = {
+            'subject_id': f'SNOMED:{row["referencedComponentId"]}',
+            'subject_label': row['referencedComponentName'],
+            # Does this represent what we want for our mapping predicate? Or is correlationId more suitable?
+            # ...or is there a SKOS predicate I can map to in case where predicate is unknown? I think most of these
+            # ...mappings are attempts at exact matches, but I can't be sure (at least not without using these fields
+            # ...to determine: mapGroup, mapPriority, mapRule, mapAdvice).
+            'predicate_id': f'SNOMED:{row["mapCategoryId"]}',
+            'predicate_label': row['mapCategoryName'],
+            'object_id': f'ICD10CM:{row["mapTarget"]}',
+            'object_label': row['mapTargetName'],
+            # If correlationId is indeed more appropriate for predicate_id, then I don't think there is a representative
+            # ...field for 'match_type'.
+            'match_type': match_type,
+            # str() lets the parser accept effectiveTime even if pandas read the column as a number.
+            'mapping_date': date_parser.parse(str(row['effectiveTime'])).date(),
+        }
+        mlist.append(_prepare_mapping(Mapping(**mdict)))
+
+    ms.mappings = mlist  # type:ignore
+    _set_metadata_in_mapping_set(mapping_set=ms, metadata=meta)
+    doc = MappingSetDocument(mapping_set=ms, prefix_map=prefix_map)
+    return to_mapping_set_dataframe(doc)
+
+
 # All from_* take as input a Python object (data frame, json, etc.) and return a MappingSetDataFrame
 # All read_* take as input a file path or handle and return a MappingSetDataFrame (usually wrapping a from_* method)
 
@@ -523,6 +676,9 @@ def get_parsing_function(input_format: Optional[str], filename: str) -> Callable
         return read_alignment_xml
     elif input_format == "obographs-json":
         return read_obographs_json
+    elif input_format == "snomed-icd10cm-map-tsv":
+        return read_snomed_icd10cm_map_tsv
+
     else:
         raise Exception(f"Unknown input format: {input_format}")
 

diff --git a/sssom/util.py b/sssom/util.py
@@ -43,12 +43,13 @@
 PREFIX_MAP_KEY = "curie_map"
 
 SSSOM_READ_FORMATS = [
-    "tsv",
-    "rdf",
+    "json",
     "owl",
+    "rdf",
+    "tsv",
     "alignment-api-xml",
     "obographs-json",
-    "json",
+    "snomed-icd10cm-map-tsv",
 ]
 SSSOM_EXPORT_FORMATS = ["tsv", "rdf", "owl", "json"]