From 56412c19dcb954c8bfe9ab6a05d87b10c3f7dd3f Mon Sep 17 00:00:00 2001
From: zethson <lukas.heumos@posteo.net>
Date: Mon, 18 Nov 2024 16:45:43 +0100
Subject: [PATCH 1/2] :bug: Fix synonym mapping being skipped when an identifier matches several names

Signed-off-by: zethson <lukas.heumos@posteo.net>
---
 lamin_utils/_map_synonyms.py | 21 ++++++++-------------
 tests/test_standardize.py    | 33 +++++++++++++++++++++++++++------
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/lamin_utils/_map_synonyms.py b/lamin_utils/_map_synonyms.py
index 4122a84..5fc399f 100644
--- a/lamin_utils/_map_synonyms.py
+++ b/lamin_utils/_map_synonyms.py
@@ -80,13 +80,13 @@ def map_synonyms(
 
     # __agg__ is a column of identifiers based on case_sensitive
     df["__agg__"] = to_str(df[field], case_sensitive=case_sensitive)
-    # field_map is {"__agg__": field_value} for mappable values
+
+    # Get both exact matches and synonyms
     field_map = pd.merge(mapped_df, df, on="__agg__").set_index("__agg__")[field]
 
-    # only runs if synonyms mapping is needed
-    # unique of field_map is needed here due to possible multiple matches of identifier
-    if len(field_map.unique()) < mapped_df.shape[0]:
-        # {synonym: name}
+    # Always check synonyms for unmatched terms
+    unmapped_terms = set(mapped_df["__agg__"]) - set(field_map.index)
+    if unmapped_terms:
         syn_map = explode_aggregated_column_to_map(
             df=df,
             agg_col=synonyms_field,
@@ -96,14 +96,11 @@ def map_synonyms(
         )
 
         if not case_sensitive:
-            # convert the synonyms to the same case_sensitive
             syn_map.index = syn_map.index.str.lower()
-            # TODO: allow returning duplicated entries
             syn_map = syn_map[syn_map.index.drop_duplicates()]
-        # if values are already in field_map, do not apply synonyms mapping
-        syn_map = {
-            k: v for k, v in syn_map.to_dict().items() if k not in field_map.index
-        }
+
+        # Only keep synonym mappings for terms not found in field_map
+        syn_map = {k: v for k, v in syn_map.to_dict().items() if k in unmapped_terms}
     else:
         syn_map = {}
 
@@ -116,7 +113,6 @@ def map_synonyms(
         logger.info(f"standardized {n_mapped}/{n_input} terms")
 
     if return_mapper:
-        # only returns mapped synonyms
         mapper = mapped[~mapped.isna()].to_dict()
         mapper = {k: v for k, v in mapper.items() if k != v}
         if keep is False:
@@ -128,7 +124,6 @@ def map_synonyms(
         else:
             return mapper
     else:
-        # returns a list in the input order with synonyms replaced
         mapped_list = (
             mapped.infer_objects(copy=False).fillna(mapped_df["orig_ids"]).tolist()
         )
diff --git a/tests/test_standardize.py b/tests/test_standardize.py
index 308da65..e9f023c 100644
--- a/tests/test_standardize.py
+++ b/tests/test_standardize.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import pytest
 from lamin_utils._map_synonyms import (
@@ -165,6 +166,32 @@ def test_map_synonyms_unsupported_field(genes):
         )
 
 
+def test_early_mismatch():
+    cell_types = {
+        "name": [
+            "Plasmablast",
+            "conventional dendritic cell",
+            "plasmablast",  # same as the first name except for case
+        ],
+        "synonyms": [
+            "",
+            "cDC|dendritic reticular cell|DC1|type 1 DC",  # only place "cDC" occurs
+            "CD27-positive|CD38-positive|CD20-negative B cell",
+        ],
+    }
+    df = pd.DataFrame(cell_types)
+
+    result = standardize(
+        df=df,
+        identifiers=["Plasmablast", "cDC"],  # one direct name, one synonym-only term
+        field="name",
+        return_field="name",
+        case_sensitive=False,  # presumably lets "Plasmablast" hit both name rows
+        synonyms_field="synonyms",
+    )
+    assert result == ["plasmablast", "conventional dendritic cell"]  # "cDC" via synonym
+
+
 def test_map_synonyms_empty_df():
     assert (
         map_synonyms(
@@ -176,8 +203,6 @@ def test_map_synonyms_empty_df():
 
 
 def test_to_str():
-    import numpy as np
-
     assert to_str(pd.Index(["A", "a", None, np.nan])).tolist() == ["a", "a", "", ""]
     assert to_str(pd.Series(["A", "a", None, np.nan])).tolist() == ["a", "a", "", ""]
     assert to_str(
@@ -186,8 +211,6 @@ def test_to_str():
 
 
 def test_not_empty_none_na():
-    import numpy as np
-
     assert not_empty_none_na(["a", None, "", np.nan]).loc[0] == "a"
     assert not_empty_none_na(pd.Index(["a", None, "", np.nan])).tolist() == ["a"]
     assert not_empty_none_na(
@@ -233,8 +256,6 @@ def test_explode_aggregated_column_to_map(genes):
 
 
 def test_to_str_categorical_series():
-    import numpy as np
-
     df = pd.DataFrame([np.nan, None, "a"])
     df[0] = df[0].astype("category")
 

From 53b99d78a4e7069855c0676dd12ebf3770adcb7c Mon Sep 17 00:00:00 2001
From: zethson <lukas.heumos@posteo.net>
Date: Mon, 18 Nov 2024 17:20:24 +0100
Subject: [PATCH 2/2] :art: Restore explanatory comments in map_synonyms

Signed-off-by: zethson <lukas.heumos@posteo.net>
---
 lamin_utils/_map_synonyms.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lamin_utils/_map_synonyms.py b/lamin_utils/_map_synonyms.py
index 5fc399f..00439aa 100644
--- a/lamin_utils/_map_synonyms.py
+++ b/lamin_utils/_map_synonyms.py
@@ -81,10 +81,9 @@ def map_synonyms(
     # __agg__ is a column of identifiers based on case_sensitive
     df["__agg__"] = to_str(df[field], case_sensitive=case_sensitive)
 
-    # Get both exact matches and synonyms
+    # field_map is {"__agg__": field_value} for mappable values
     field_map = pd.merge(mapped_df, df, on="__agg__").set_index("__agg__")[field]
 
-    # Always check synonyms for unmatched terms
     unmapped_terms = set(mapped_df["__agg__"]) - set(field_map.index)
     if unmapped_terms:
         syn_map = explode_aggregated_column_to_map(
@@ -96,7 +95,9 @@ def map_synonyms(
         )
 
         if not case_sensitive:
+            # convert the synonyms to the same case_sensitive
             syn_map.index = syn_map.index.str.lower()
+            # TODO: allow returning duplicated entries
             syn_map = syn_map[syn_map.index.drop_duplicates()]
 
         # Only keep synonym mappings for terms not found in field_map
@@ -113,6 +114,7 @@ def map_synonyms(
         logger.info(f"standardized {n_mapped}/{n_input} terms")
 
     if return_mapper:
+        # only returns mapped synonyms
         mapper = mapped[~mapped.isna()].to_dict()
         mapper = {k: v for k, v in mapper.items() if k != v}
         if keep is False:
@@ -124,6 +126,7 @@ def map_synonyms(
         else:
             return mapper
     else:
+        # returns a list in the input order with synonyms replaced
         mapped_list = (
             mapped.infer_objects(copy=False).fillna(mapped_df["orig_ids"]).tolist()
         )