laminlabs · Zethson · Nov 20, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/lamin_utils/_map_synonyms.py b/lamin_utils/_map_synonyms.py
@@ -80,13 +80,12 @@ def map_synonyms(
 
     # __agg__ is a column of identifiers based on case_sensitive
     df["__agg__"] = to_str(df[field], case_sensitive=case_sensitive)
+
     # field_map is {"__agg__": field_value} for mappable values
     field_map = pd.merge(mapped_df, df, on="__agg__").set_index("__agg__")[field]
 
-    # only runs if synonyms mapping is needed
-    # unique of field_map is needed here due to possible multiple matches of identifier
-    if len(field_map.unique()) < mapped_df.shape[0]:
-        # {synonym: name}
+    unmapped_terms = set(mapped_df["__agg__"]) - set(field_map.index)
+    if unmapped_terms:
         syn_map = explode_aggregated_column_to_map(
             df=df,
             agg_col=synonyms_field,
@@ -100,10 +99,9 @@ def map_synonyms(
             syn_map.index = syn_map.index.str.lower()
             # TODO: allow returning duplicated entries
             syn_map = syn_map[syn_map.index.drop_duplicates()]
-        # if values are already in field_map, do not apply synonyms mapping
-        syn_map = {
-            k: v for k, v in syn_map.to_dict().items() if k not in field_map.index
-        }
+
+        # Only keep synonym mappings for terms not found in field_map
+        syn_map = {k: v for k, v in syn_map.to_dict().items() if k in unmapped_terms}
     else:
         syn_map = {}
 

diff --git a/tests/test_standardize.py b/tests/test_standardize.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import pytest
 from lamin_utils._map_synonyms import (
@@ -165,6 +166,32 @@ def test_map_synonyms_unsupported_field(genes):
         )
 
 
+def test_early_mismatch():  # standardize() on a name that matches two rows by case plus a synonym-only term
+    cell_types = {
+        "name": [
+            "Plasmablast",  # differs from the third name only by letter case
+            "conventional dendritic cell",
+            "plasmablast",
+        ],
+        "synonyms": [
+            "",
+            "cDC|dendritic reticular cell|DC1|type 1 DC",  # the only place "cDC" appears in this fixture
+            "CD27-positive|CD38-positive|CD20-negative B cell",
+        ],
+    }
+    df = pd.DataFrame(cell_types)
+
+    result = standardize(
+        df=df,
+        identifiers=["Plasmablast", "cDC"],  # first is present in "name"; second only in "synonyms"
+        field="name",
+        return_field="name",
+        case_sensitive=False,  # NOTE(review): presumably makes "Plasmablast" hit both name rows - confirm
+        synonyms_field="synonyms",
+    )
+    assert result == ["plasmablast", "conventional dendritic cell"]  # "cDC" must resolve via its synonym entry
+
+
 def test_map_synonyms_empty_df():
     assert (
         map_synonyms(
@@ -176,8 +203,6 @@ def test_map_synonyms_empty_df():
 
 
 def test_to_str():
-    import numpy as np
-
     assert to_str(pd.Index(["A", "a", None, np.nan])).tolist() == ["a", "a", "", ""]
     assert to_str(pd.Series(["A", "a", None, np.nan])).tolist() == ["a", "a", "", ""]
     assert to_str(
@@ -186,8 +211,6 @@ def test_to_str():
 
 
 def test_not_empty_none_na():
-    import numpy as np
-
     assert not_empty_none_na(["a", None, "", np.nan]).loc[0] == "a"
     assert not_empty_none_na(pd.Index(["a", None, "", np.nan])).tolist() == ["a"]
     assert not_empty_none_na(
@@ -233,8 +256,6 @@ def test_explode_aggregated_column_to_map(genes):
 
 
 def test_to_str_categorical_series():
-    import numpy as np
-
     df = pd.DataFrame([np.nan, None, "a"])
     df[0] = df[0].astype("category")