Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Fix standardize exact matches #94

Merged
merged 2 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions lamin_utils/_map_synonyms.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,12 @@ def map_synonyms(

# __agg__ is a column of identifiers based on case_sensitive
df["__agg__"] = to_str(df[field], case_sensitive=case_sensitive)

# field_map is {"__agg__": field_value} for mappable values
field_map = pd.merge(mapped_df, df, on="__agg__").set_index("__agg__")[field]

# only runs if synonyms mapping is needed
# unique of field_map is needed here due to possible multiple matches of identifier
if len(field_map.unique()) < mapped_df.shape[0]:
# {synonym: name}
unmapped_terms = set(mapped_df["__agg__"]) - set(field_map.index)
if unmapped_terms:
syn_map = explode_aggregated_column_to_map(
df=df,
agg_col=synonyms_field,
Expand All @@ -100,10 +99,9 @@ def map_synonyms(
syn_map.index = syn_map.index.str.lower()
# TODO: allow returning duplicated entries
syn_map = syn_map[syn_map.index.drop_duplicates()]
# if values are already in field_map, do not apply synonyms mapping
syn_map = {
k: v for k, v in syn_map.to_dict().items() if k not in field_map.index
}

# Only keep synonym mappings for terms not found in field_map
syn_map = {k: v for k, v in syn_map.to_dict().items() if k in unmapped_terms}
else:
syn_map = {}

Expand Down
33 changes: 27 additions & 6 deletions tests/test_standardize.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest
from lamin_utils._map_synonyms import (
Expand Down Expand Up @@ -165,6 +166,32 @@ def test_map_synonyms_unsupported_field(genes):
)


def test_early_mismatch():
    """Check standardization when one query hits a name and one hits a synonym.

    The reference table holds two names that differ only by case
    ("Plasmablast" and "plasmablast") plus one entry whose synonyms
    include "cDC". With case-insensitive matching, the query "Plasmablast"
    is expected to come back as "plasmablast" and the query "cDC" as
    "conventional dendritic cell".
    """
    names = [
        "Plasmablast",
        "conventional dendritic cell",
        "plasmablast",
    ]
    synonyms = [
        "",
        "cDC|dendritic reticular cell|DC1|type 1 DC",
        "CD27-positive|CD38-positive|CD20-negative B cell",
    ]
    reference = pd.DataFrame({"name": names, "synonyms": synonyms})

    # One identifier matches the "name" column directly (ignoring case);
    # the other appears only inside the pipe-separated "synonyms" column.
    queries = ["Plasmablast", "cDC"]
    expected = ["plasmablast", "conventional dendritic cell"]

    standardized = standardize(
        df=reference,
        identifiers=queries,
        field="name",
        return_field="name",
        case_sensitive=False,
        synonyms_field="synonyms",
    )

    assert standardized == expected


def test_map_synonyms_empty_df():
assert (
map_synonyms(
Expand All @@ -176,8 +203,6 @@ def test_map_synonyms_empty_df():


def test_to_str():
import numpy as np

assert to_str(pd.Index(["A", "a", None, np.nan])).tolist() == ["a", "a", "", ""]
assert to_str(pd.Series(["A", "a", None, np.nan])).tolist() == ["a", "a", "", ""]
assert to_str(
Expand All @@ -186,8 +211,6 @@ def test_to_str():


def test_not_empty_none_na():
import numpy as np

assert not_empty_none_na(["a", None, "", np.nan]).loc[0] == "a"
assert not_empty_none_na(pd.Index(["a", None, "", np.nan])).tolist() == ["a"]
assert not_empty_none_na(
Expand Down Expand Up @@ -233,8 +256,6 @@ def test_explode_aggregated_column_to_map(genes):


def test_to_str_categorical_series():
import numpy as np

df = pd.DataFrame([np.nan, None, "a"])
df[0] = df[0].astype("category")

Expand Down
Loading