From 56412c19dcb954c8bfe9ab6a05d87b10c3f7dd3f Mon Sep 17 00:00:00 2001 From: zethson Date: Mon, 18 Nov 2024 16:45:43 +0100 Subject: [PATCH 1/2] :bug: Fix synonyms search Signed-off-by: zethson --- lamin_utils/_map_synonyms.py | 21 ++++++++------------- tests/test_standardize.py | 33 +++++++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/lamin_utils/_map_synonyms.py b/lamin_utils/_map_synonyms.py index 4122a84..5fc399f 100644 --- a/lamin_utils/_map_synonyms.py +++ b/lamin_utils/_map_synonyms.py @@ -80,13 +80,13 @@ def map_synonyms( # __agg__ is a column of identifiers based on case_sensitive df["__agg__"] = to_str(df[field], case_sensitive=case_sensitive) - # field_map is {"__agg__": field_value} for mappable values + + # Get both exact matches and synonyms field_map = pd.merge(mapped_df, df, on="__agg__").set_index("__agg__")[field] - # only runs if synonyms mapping is needed - # unique of field_map is needed here due to possible multiple matches of identifier - if len(field_map.unique()) < mapped_df.shape[0]: - # {synonym: name} + # Always check synonyms for unmatched terms + unmapped_terms = set(mapped_df["__agg__"]) - set(field_map.index) + if unmapped_terms: syn_map = explode_aggregated_column_to_map( df=df, agg_col=synonyms_field, @@ -96,14 +96,11 @@ def map_synonyms( ) if not case_sensitive: - # convert the synonyms to the same case_sensitive syn_map.index = syn_map.index.str.lower() - # TODO: allow returning duplicated entries syn_map = syn_map[syn_map.index.drop_duplicates()] - # if values are already in field_map, do not apply synonyms mapping - syn_map = { - k: v for k, v in syn_map.to_dict().items() if k not in field_map.index - } + + # Only keep synonym mappings for terms not found in field_map + syn_map = {k: v for k, v in syn_map.to_dict().items() if k in unmapped_terms} else: syn_map = {} @@ -116,7 +113,6 @@ def map_synonyms( logger.info(f"standardized {n_mapped}/{n_input} terms") if return_mapper: - # only returns mapped synonyms mapper = mapped[~mapped.isna()].to_dict() mapper = {k: v for k, v in mapper.items() if k != v} if keep is False: @@ -128,7 +124,6 @@ def map_synonyms( else: return mapper else: - # returns a list in the input order with synonyms replaced mapped_list = ( mapped.infer_objects(copy=False).fillna(mapped_df["orig_ids"]).tolist() ) diff --git a/tests/test_standardize.py b/tests/test_standardize.py index 308da65..e9f023c 100644 --- a/tests/test_standardize.py +++ b/tests/test_standardize.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest from lamin_utils._map_synonyms import ( @@ -165,6 +166,32 @@ def test_map_synonyms_unsupported_field(genes): ) +def test_early_mismatch(): + cell_types = { + "name": [ + "Plasmablast", + "conventional dendritic cell", + "plasmablast", + ], + "synonyms": [ + "", + "cDC|dendritic reticular cell|DC1|type 1 DC", + "CD27-positive|CD38-positive|CD20-negative B cell", + ], + } + df = pd.DataFrame(cell_types) + + result = standardize( + df=df, + identifiers=["Plasmablast", "cDC"], + field="name", + return_field="name", + case_sensitive=False, + synonyms_field="synonyms", + ) + assert result == ["plasmablast", "conventional dendritic cell"] + + def test_map_synonyms_empty_df(): assert ( map_synonyms( @@ -176,8 +203,6 @@ def test_map_synonyms_empty_df(): def test_to_str(): - import numpy as np - assert to_str(pd.Index(["A", "a", None, np.nan])).tolist() == ["a", "a", "", ""] assert to_str(pd.Series(["A", "a", None, np.nan])).tolist() == ["a", "a", "", ""] assert to_str( @@ -186,8 +211,6 @@ def test_to_str(): def test_not_empty_none_na(): - import numpy as np - assert not_empty_none_na(["a", None, "", np.nan]).loc[0] == "a" assert not_empty_none_na(pd.Index(["a", None, "", np.nan])).tolist() == ["a"] assert not_empty_none_na( @@ -233,8 +256,6 @@ def test_explode_aggregated_column_to_map(genes): def test_to_str_categorical_series(): - import numpy as np - df = pd.DataFrame([np.nan, None, "a"]) df[0] = df[0].astype("category") From 53b99d78a4e7069855c0676dd12ebf3770adcb7c Mon Sep 17 00:00:00 2001 From: zethson Date: Mon, 18 Nov 2024 17:20:24 +0100 Subject: [PATCH 2/2] :art: Polish Signed-off-by: zethson --- lamin_utils/_map_synonyms.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lamin_utils/_map_synonyms.py b/lamin_utils/_map_synonyms.py index 5fc399f..00439aa 100644 --- a/lamin_utils/_map_synonyms.py +++ b/lamin_utils/_map_synonyms.py @@ -81,10 +81,9 @@ def map_synonyms( # __agg__ is a column of identifiers based on case_sensitive df["__agg__"] = to_str(df[field], case_sensitive=case_sensitive) - # Get both exact matches and synonyms + # field_map is {"__agg__": field_value} for mappable values field_map = pd.merge(mapped_df, df, on="__agg__").set_index("__agg__")[field] - # Always check synonyms for unmatched terms unmapped_terms = set(mapped_df["__agg__"]) - set(field_map.index) if unmapped_terms: syn_map = explode_aggregated_column_to_map( @@ -96,7 +95,9 @@ def map_synonyms( ) if not case_sensitive: + # convert the synonyms to the same case_sensitive syn_map.index = syn_map.index.str.lower() + # TODO: allow returning duplicated entries syn_map = syn_map[syn_map.index.drop_duplicates()] # Only keep synonym mappings for terms not found in field_map @@ -113,6 +114,7 @@ def map_synonyms( logger.info(f"standardized {n_mapped}/{n_input} terms") if return_mapper: + # only returns mapped synonyms mapper = mapped[~mapped.isna()].to_dict() mapper = {k: v for k, v in mapper.items() if k != v} if keep is False: @@ -124,6 +126,7 @@ def map_synonyms( else: return mapper else: + # returns a list in the input order with synonyms replaced mapped_list = ( mapped.infer_objects(copy=False).fillna(mapped_df["orig_ids"]).tolist() )