Skip to content

Commit

Permalink
New Feature: SNOMED::ICD10CM Mapping Support
Browse files Browse the repository at this point in the history
- Added a feature to convert the premade SNOMED-to-ICD10CM mappings provided by SNOMED into SSSOM format. (WIP)

General updates
- util.py: Reorganized SSSOM_READ_FORMATS: the top half lists plain data formats and the bottom half lists special-case formats. Both halves of the list are alphabetically sorted.

Temp updates
- Changed some relative imports to absolute imports, in order to speed up development and make debugging easier. It is possible that this could be a good permanent change too, though.
  • Loading branch information
joeflack4 committed Mar 2, 2022
1 parent bf9c32b commit bae870a
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 10 deletions.
14 changes: 7 additions & 7 deletions sssom/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
from rdflib import Graph
from scipy.stats import chi2_contingency

from .cliques import split_into_cliques, summarize_cliques
from .io import convert_file, parse_file, split_file, validate_file
from .parsers import read_sssom_table
from .rdf_util import rewire_graph
from .sparql_util import EndpointConfig, query_mappings
from .util import (
from sssom.cliques import split_into_cliques, summarize_cliques
from sssom.io import convert_file, parse_file, split_file, validate_file
from sssom.parsers import read_sssom_table
from sssom.rdf_util import rewire_graph
from sssom.sparql_util import EndpointConfig, query_mappings
from sssom.util import (
SSSOM_EXPORT_FORMATS,
SSSOM_READ_FORMATS,
MappingSetDataFrame,
Expand All @@ -41,7 +41,7 @@
remove_unmatched,
to_mapping_set_dataframe,
)
from .writers import write_table
from sssom.writers import write_table

# Click input options common across commands.
# `input_argument` declares a required positional argument named "input" whose
# value is typed as click.Path().
input_argument = click.argument("input", required=True, type=click.Path())
Expand Down
156 changes: 156 additions & 0 deletions sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,24 @@ def read_obographs_json(
)


def read_snomed_icd10cm_map_tsv(
    file_path: str,
    prefix_map: Optional[Dict[str, str]] = None,
    meta: Optional[Dict[str, str]] = None,
) -> MappingSetDataFrame:
    """Parse a SNOMED ICD10CM mapping TSV file into a MappingSetDataFrame.

    The file is checked with ``raise_for_bad_path``, loaded with
    ``read_pandas`` and then handed to ``from_snomed_icd10cm_map_tsv`` for the
    actual conversion.

    :param file_path: The path to the SNOMED ICD10CM map TSV file
    :param prefix_map: an optional prefix map
    :param meta: an optional dictionary of metadata elements
    :return: A SSSOM MappingSetDataFrame
    """
    raise_for_bad_path(file_path)
    df = read_pandas(file_path)
    return from_snomed_icd10cm_map_tsv(df, prefix_map=prefix_map, meta=meta)


def _get_prefix_map_and_metadata(
prefix_map: Optional[PrefixMap] = None, meta: Optional[MetadataType] = None
) -> Metadata:
Expand Down Expand Up @@ -499,6 +517,141 @@ def from_obographs(
return to_mapping_set_dataframe(mdoc)


def from_snomed_icd10cm_map_tsv(
    df: pd.DataFrame,
    prefix_map: Optional[PrefixMap] = None,
    meta: Optional[MetadataType] = None,
) -> MappingSetDataFrame:
    """Convert a snomed_icd10cm_map dataframe to a MappingSetDataFrame.

    Every row of ``df`` is turned into one mapping whose subject is the SNOMED
    concept (``referencedComponentId``), whose object is the ICD10CM code
    (``mapTarget``) and whose predicate is the SNOMED map category
    (``mapCategoryId``). Besides the fields documented below, the conversion
    also reads the label columns ``referencedComponentName``,
    ``mapCategoryName`` and ``mapTargetName``.

    Field descriptions
    (taken from: doc_Icd10cmMapReleaseNotes_Current-en-US_US1000124_20210901.pdf)

    FIELD,DATA_TYPE,PURPOSE

    - id,UUID,A 128 bit unsigned integer, uniquely identifying the map record
    - effectiveTime,Time,Specifies the inclusive date at which this change becomes effective.
    - active,Boolean,Specifies whether the member's state was active (=1) or inactive (=0) from the nominal release
      date specified by the effectiveTime field.
    - moduleId,SctId,Identifies the member version's module. Set to a child of 900000000000443000|Module| within the
      metadata hierarchy.
    - refSetId,SctId,Set to one of the children of the |Complex map type| concept in the metadata hierarchy.
    - referencedComponentId,SctId,The SNOMED CT source concept ID that is the subject of the map record.
    - mapGroup,Integer,An integer identifying a grouping of complex map records which will designate one map target
      at the time of map rule evaluation. Source concepts that require two map targets for classification will have
      two sets of map groups.
    - mapPriority,Integer,Within a map group, the mapPriority specifies the order in which complex map records
      should be evaluated to determine the correct map target.
    - mapRule,String,A machine-readable rule, (evaluating to either 'true' or 'false' at run-time) that indicates
      whether this map record should be selected within its map group
    - mapAdvice,String,Human-readable advice that may be employed by the software vendor to give an end-user advice
      on selection of the appropriate target code. This includes a) a summary statement of the map rule logic, b) a
      statement of any limitations of the map record and c) additional classification guidance for the coding
      professional.
    - mapTarget,String,The target ICD-10 classification code of the map record.
    - correlationId,SctId,A child of |Map correlation value| in the metadata hierarchy, identifying the correlation
      between the SNOMED CT concept and the target code.
    - mapCategoryId,SctId,Identifies the SNOMED CT concept in the metadata hierarchy which is the MapCategory for
      the associated map record. This is a subtype of 447634004 |ICD-10 Map Category value|.

    :param df: A mappings dataframe
    :param prefix_map: A prefix map
    :param meta: A metadata dictionary
    :return: MappingSetDataFrame
    """
    # TODO: If using in the end, import at top of file
    from dateutil import parser as date_parser
    from .sssom_datamodel import MatchTypeEnum

    prefix_map = _ensure_prefix_map(prefix_map)
    ms = _init_mapping_set(meta)

    # In officially downloaded SNOMED mappings every record had a correlationId of 447561005, which is SNOMED's
    # "correlation not specified" value:
    # https://www.findacode.com/snomed/447561005--snomed-ct-source-code-to-target-map-correlation-not-specified.html
    # No other correlationId is translated to a match type yet, so every mapping is 'Unspecified' regardless of the
    # row; the value is therefore built once here instead of being re-derived per row.
    match_type = MatchTypeEnum('Unspecified')

    # TODO: SNOMED: parse as many as possible:
    #  - id --> other
    #  - active --> other
    #  - moduleId --> subject_category?
    #  - refsetId --> subject_category?
    #  - mapGroup --> other?
    #  - mapPriority --> other?
    #  - mapRule --> objectCategory? other?
    #    ex: TRUE: when "ALWAYS <code>" is in pipe-delimited list in mapAdvice, this always shows TRUE. Does this
    #    mean I could use skos:exactMatch in these cases?
    #  - mapAdvice --> objectCategory? other?
    #    ex: "ALWAYS Q71.30 | CONSIDER LATERALITY SPECIFICATION"
    #    any way to determine skos:narrowerThan, etc, from this?

    # TODO: SSSOM: use as many as possible
    #  https://mapping-commons.github.io/sssom/Mapping/
    #  - subject_category: Optional[str] = None
    #    Description: The conceptual category to which the subject belongs to. This can be a string denoting the
    #    category or a term from a controlled vocabulary.
    #    Example: UBERON:0001062 (The CURIE of the Uberon term for "anatomical entity".)
    #    Example: biolink:Gene (The CURIE of the biolink class for genes.)
    #  - predicate_modifier
    #    Description: A modifier for negating the predicate.
    #    See https://github.com/mapping-commons/sssom/issues/40 for discussion
    #    Range: PredicateModifierEnum: (joe: only lists 'Not' as an option)
    #    Example: Not Negates the predicate, see documentation of predicate_modifier_enum
    #  - object_category
    #    Description: The conceptual category to which the subject belongs to. This can be a string denoting the
    #    category or a term from a controlled vocabulary.
    #    Example: UBERON:0001062 (The CURIE of the Uberon term for "anatomical entity".)
    #    Example: biolink:Gene (The CURIE of the biolink class for genes.)

    # TODO: Make GH Issue: Suggest JSON instead of pipe-delimited. Also it doesn't say what should be the delimiter
    #  ...between key/val pairs (which I'm guessing is either = or :).
    #  - other: Description: Pipe separated list of key value pairs for properties not part of the SSSOM spec.
    #    Can be used to encode additional provenance data.
    #  - comment: Description: Free text field containing either curator notes or text generated by tool providing
    #    additional informative information.

    #  - author_id: can this be "SNOMED"?
    #  - author_label: can this be "SNOMED"?
    #  - reviewer_id: can this be "SNOMED"?
    #  - reviewer_label: can this be "SNOMED"?
    #  - creator_id: can this be "SNOMED"?
    #  - creator_label: can this be "SNOMED"?
    #  - license: Is this something that can be determined?
    #  - subject_source: URL of some official page for SNOMED version used?
    #  - subject_source_version: Is this knowable?
    #  - object_source: URL of some official page for ICD10CM version used?
    #  - object_source_version: would this be "10CM" as in "ICD10CM"? Or something else? Or nothing?
    #  - mapping_provider: can this be "SNOMED"?
    #  - mapping_cardinality: Could I determine 1:1 or 1:many or many:1 based on:
    #    mapGroup, mapPriority, mapRule, mapAdvice?
    #  - match_term_type: What is this?
    #  - see_also: Should this be a URL to the SNOMED term?

    mlist: List[Mapping] = []
    for _, row in df.iterrows():
        mdict = {
            'subject_id': f'SNOMED:{row["referencedComponentId"]}',
            'subject_label': row['referencedComponentName'],
            # Does this represent what we want for our mapping predicate? Or is correlationId more suitable?
            # ...or is there a SKOS predicate I can map to in case where predicate is unknown? I think most of these
            # ...mappings are attempts at exact matches, but I can't be sure (at least not without using these fields
            # ...to determine: mapGroup, mapPriority, mapRule, mapAdvice).
            'predicate_id': f'SNOMED:{row["mapCategoryId"]}',
            'predicate_label': row['mapCategoryName'],
            'object_id': f'ICD10CM:{row["mapTarget"]}',
            'object_label': row['mapTargetName'],
            # If correlationId is indeed more appropriate for predicate_id, then I don't think there is a
            # ...representative field for 'match_type'.
            'match_type': match_type,
            # effectiveTime is stringified first so the date parser always receives text.
            'mapping_date': date_parser.parse(str(row['effectiveTime'])).date(),
        }
        mlist.append(_prepare_mapping(Mapping(**mdict)))

    ms.mappings = mlist  # type:ignore
    _set_metadata_in_mapping_set(mapping_set=ms, metadata=meta)
    doc = MappingSetDocument(mapping_set=ms, prefix_map=prefix_map)
    return to_mapping_set_dataframe(doc)


# All from_* take as an input a python object (data frame, json, etc) and return a MappingSetDataFrame
# All read_* take as an input a file handle and return a MappingSetDataFrame (usually wrapping a from_* method)

Expand All @@ -523,6 +676,9 @@ def get_parsing_function(input_format: Optional[str], filename: str) -> Callable
return read_alignment_xml
elif input_format == "obographs-json":
return read_obographs_json
elif input_format == "snomed-icd10cm-map-tsv":
return read_snomed_icd10cm_map_tsv

else:
raise Exception(f"Unknown input format: {input_format}")

Expand Down
7 changes: 4 additions & 3 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,13 @@
PREFIX_MAP_KEY = "curie_map"

# Input format names. The first group holds the plain data formats and the
# second group the special-case formats; each group is sorted alphabetically
# and every format appears exactly once.
SSSOM_READ_FORMATS = [
    "json",
    "owl",
    "rdf",
    "tsv",
    "alignment-api-xml",
    "obographs-json",
    "snomed-icd10cm-map-tsv",
]
SSSOM_EXPORT_FORMATS = ["tsv", "rdf", "owl", "json"]

Expand Down

0 comments on commit bae870a

Please sign in to comment.