ENH: make name of positive_set column fully configurable
mbaak committed Feb 16, 2024
1 parent 575926a commit 4f5f8da
Showing 6 changed files with 34 additions and 14 deletions.
6 changes: 4 additions & 2 deletions emm/aggregation/base_entity_aggregation.py
@@ -143,6 +143,7 @@ def __init__(
         gt_preprocessed_col: str = "gt_preprocessed",
         aggregation_method: Literal["max_frequency_nm_score", "mean_score"] = "max_frequency_nm_score",
         blacklist: list | None = None,
+        positive_set_col: str = "positive_set",
     ) -> None:
         self.score_col = score_col
         self.account_col = account_col
@@ -158,6 +159,7 @@ def __init__(
         self.gt_preprocessed_col = gt_preprocessed_col
         self.aggregation_method = aggregation_method
         self.blacklist = blacklist or []
+        self.positive_set_col = positive_set_col

         # perform very basic preprocessing to blacklist, remove abbreviations, to lower, etc.
         self.blacklist = [preprocess(name) for name in self.blacklist]
@@ -171,8 +173,8 @@ def get_group(self, dataframe) -> list[str]:
         group += [self.index_col]

         # Useful for collect_metrics()
-        if "positive_set" in dataframe.columns:
-            group += ["positive_set"]
+        if self.positive_set_col in dataframe.columns:
+            group += [self.positive_set_col]

         # Notice we lose the name_to_match 'uid' column here
         return group
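For orientation, a minimal usage sketch of the new parameter. The class name BaseEntityAggregation and the assumption that its other constructor arguments have workable defaults are inferred from the file name, not shown in this diff; the score/account column names are illustrative.

    import pandas as pd

    from emm.aggregation.base_entity_aggregation import BaseEntityAggregation

    # Hedged sketch: constructor arguments other than positive_set_col are assumptions.
    agg = BaseEntityAggregation(
        score_col="nm_score",
        account_col="account",
        positive_set_col="is_positive",  # previously hard-coded to "positive_set"
    )

    # get_group() now includes the custom column whenever the dataframe has it.
    df = pd.DataFrame({"account": ["a1"], "is_positive": [True]})
    print(agg.get_group(df))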
19 changes: 16 additions & 3 deletions emm/data/create_data.py
@@ -259,6 +259,7 @@ def pandas_create_noised_data(
     name_col="Name",
     index_col="Index",
     random_seed=None,
+    positive_set_col="positive_set",
 ):
     """Create pandas noised dataset based on company names from kvk.

@@ -274,6 +275,7 @@
         name_col: name column in csv file
         index_col: name-id column in csv file (optional)
         random_seed: seed to use
+        positive_set_col: name of positive set column in csv file, default is "positive_set".

     Returns:
         ground_truth and companies_noised_pd pandas dataframes
@@ -331,7 +333,7 @@
     pos = shuffled_ids[: len(shuffled_ids) // 2]
     # ground truth only contains companies in positive set
     is_in_pos = companies_pd["Index"].isin(pos)
-    companies_pd["positive_set"] = is_in_pos
+    companies_pd[positive_set_col] = is_in_pos

     if split_pos_neg:
         ground_truth = companies_pd[is_in_pos].copy()
@@ -387,6 +389,7 @@ def create_noised_data(
     index_col="Index",
     ret_posneg=False,
     random_seed=None,
+    positive_set_col="positive_set",
 ):
     """Create spark noised dataset based on company names from kvk.

@@ -404,6 +407,7 @@
         index_col: name-id column in csv file (optional)
         ret_posneg: if true also return original positive and negative spark true datasets
         random_seed: seed to use
+        positive_set_col: name of positive set column in csv file, default is "positive_set".

     Returns:
         ground_truth and companies_noised_pd spark dataframes
@@ -412,8 +416,17 @@
     # location of local sample of kvk unit test dataset; downloads the dataset in case not present.
     data_path, _ = retrieve_kvk_test_sample()

+    # name_col and index_col get renamed to Name and Index
     (ground_truth_pd, companies_noised_pd, positive_noised_pd, negative_pd) = pandas_create_noised_data(
-        noise_level, noise_type, noise_count, split_pos_neg, data_path, name_col, index_col, random_seed
+        noise_level,
+        noise_type,
+        noise_count,
+        split_pos_neg,
+        data_path,
+        name_col,
+        index_col,
+        random_seed,
+        positive_set_col,
     )

     # Sparkify dataframes
@@ -424,7 +437,7 @@
         StructField("amount", FloatType(), True),
         StructField("counterparty_account_count_distinct", IntegerType(), nullable=True),
         StructField("uid", IntegerType(), nullable=True),
-        StructField("positive_set", BooleanType(), True),
+        StructField(positive_set_col, BooleanType(), True),
         StructField("country", StringType(), nullable=True),
         StructField("account", StringType(), True),
     ]
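A sketch of driving the pandas helper with a custom column name. The noise arguments are illustrative values, and retrieve_kvk_test_sample is assumed importable from this module, where the diff shows it in use.

    from emm.data.create_data import pandas_create_noised_data, retrieve_kvk_test_sample

    # Illustrative argument values; only positive_set_col is the point of this commit.
    data_path, _ = retrieve_kvk_test_sample()
    ground_truth, noised, positive_noised, negative = pandas_create_noised_data(
        noise_level=0.3,
        noise_type="all",
        noise_count=1,
        split_pos_neg=True,
        data_path=data_path,
        positive_set_col="is_positive",
    )
    # The boolean membership flag now lives in "is_positive" instead of "positive_set".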
6 changes: 4 additions & 2 deletions emm/data/prepare_name_pairs.py
@@ -87,7 +87,7 @@ def prepare_name_pairs_pd(
     candidates_pd["correct"] = candidates_pd[entity_id_col] == candidates_pd[gt_entity_id_col]

     # negative sample creation?
-    # if so, add positive_set column for negative sample creation
+    # if so, add positive_set_col column for negative sample creation
     rng = np.random.default_rng(random_seed)
     create_negative_sample_fraction = min(create_negative_sample_fraction, 1)
     create_negative_sample = create_negative_sample_fraction > 0
@@ -98,7 +98,9 @@
         pos_ids = list(rng.choice(ids, n_positive, replace=False))
         candidates_pd[positive_set_col] = candidates_pd[entity_id_col].isin(pos_ids)
     elif create_negative_sample and positive_set_col in candidates_pd.columns:
-        logger.info("create_negative_sample_fraction is set, but positive_set already defined; using the latter.")
+        logger.info(
+            f"create_negative_sample_fraction is set, but {positive_set_col} already defined; using the latter."
+        )

     # We remove duplicates ground-truth name candidates in the pure string similarity model (i.e. when WITHOUT_RANK_FEATURES==True)
     # because we noticed that when we don't do this, the model learns that perfect match are worst than non-perfect match (like different legal form)
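The negative-sampling bookkeeping above can be illustrated standalone; this toy snippet mirrors the diffed logic on hypothetical data.

    import numpy as np
    import pandas as pd

    candidates_pd = pd.DataFrame({"gt_entity_id": [1, 1, 2, 3, 4, 4]})
    positive_set_col = "is_positive"  # configurable since this commit
    create_negative_sample_fraction = 0.5

    # Mark a random fraction of ground-truth ids as the positive set.
    rng = np.random.default_rng(42)
    ids = candidates_pd["gt_entity_id"].unique()
    n_positive = int(len(ids) * create_negative_sample_fraction)
    pos_ids = list(rng.choice(ids, n_positive, replace=False))
    candidates_pd[positive_set_col] = candidates_pd["gt_entity_id"].isin(pos_ids)
    print(candidates_pd)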
6 changes: 2 additions & 4 deletions emm/pipeline/pandas_entity_matching.py
@@ -538,10 +538,8 @@ def combine_sm_results(df: pd.DataFrame, sel_cand: pd.DataFrame, test_gt: pd.DataFrame
             res = df.join(sel_cand[["gt_entity_id", "gt_name", "gt_preprocessed", "nm_score", "score_0"]], how="left")
             res["nm_score"] = res["nm_score"].fillna(-1)
             res["score_0"] = res["score_0"].fillna(-1)
-            res["positive_set"] = res["id"].isin(test_gt["id"])
-            res["correct"] = ((res["positive_set"]) & (res["id"] == res["gt_entity_id"])) | (
-                (~res["positive_set"]) & (res["id"].isnull())
-            )
+            is_in_pos = res["id"].isin(test_gt["id"])
+            res["correct"] = ((is_in_pos) & (res["id"] == res["gt_entity_id"])) | ((~is_in_pos) & (res["id"].isnull()))
             return res

         test_candidates = self.transform(test_names_to_match.copy())
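The refactored correctness rule is worth a toy check: a row is correct when it is in the positive set and matched to its own id, or when it is negative and received no candidate. Data below is hypothetical.

    import pandas as pd

    res = pd.DataFrame({"id": [1, 2, None], "gt_entity_id": [1, 3, None]})
    test_gt_ids = pd.Series([1, 2])

    is_in_pos = res["id"].isin(test_gt_ids)
    res["correct"] = ((is_in_pos) & (res["id"] == res["gt_entity_id"])) | ((~is_in_pos) & (res["id"].isnull()))
    print(res["correct"].tolist())  # [True, False, True]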
2 changes: 1 addition & 1 deletion emm/supervised_model/base_supervised_model.py
@@ -256,7 +256,7 @@ def train_test_model(

     y = dataset["correct"].astype(str) + dataset["no_candidate"].astype(str)
     if positive_set_col in dataset.columns:
-        y += dataset["positive_set"].astype(str)
+        y += dataset[positive_set_col].astype(str)

     # Train test split with consistent name-to-match account (group) and with approximately same class balance y (stratified)
     # it is important to have all the name in the same account, for account matching after aggregation
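A toy illustration of the stratification label built above: the concatenated string encodes one class per combination of the three flags, with the positive-set flag now read from the configurable column. Column name and values are hypothetical.

    import pandas as pd

    dataset = pd.DataFrame({
        "correct": [True, False, False],
        "no_candidate": [False, False, True],
        "my_pos_set": [True, True, False],  # hypothetical custom column name
    })
    positive_set_col = "my_pos_set"

    y = dataset["correct"].astype(str) + dataset["no_candidate"].astype(str)
    if positive_set_col in dataset.columns:
        y += dataset[positive_set_col].astype(str)
    print(y.tolist())  # ['TrueFalseTrue', 'FalseFalseTrue', 'FalseTrueFalse']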
9 changes: 7 additions & 2 deletions emm/threshold/threshold_decision.py
@@ -72,6 +72,7 @@ def get_threshold_curves_parameters(
     score_col: str = "nm_score",
     aggregation_layer: bool = False,
     aggregation_method: str = "name_clustering",
+    positive_set_col: str = "positive_set",
 ) -> dict:
     """Get threshold decision curves

@@ -80,12 +81,16 @@
         score_col: which score column to use, default is 'nm_score'. For aggregation use 'agg_score'.
         aggregation_layer: use aggregation layer? default is False.
         aggregation_method: which aggregation method is used? 'name_clustering' or 'mean_score'.
+        positive_set_col: name of positive set column in best candidates df. default is 'positive_set'

     Returns:
         dictionary with threshold decision curves
     """
-    best_positive_df = best_candidate_df[best_candidate_df.positive_set]
-    best_negative_df = best_candidate_df[~best_candidate_df.positive_set]
+    if positive_set_col not in best_candidate_df.columns:
+        msg = f"positive set column {positive_set_col} not in best_candidates df."
+        raise ValueError(msg)
+    best_positive_df = best_candidate_df[best_candidate_df[positive_set_col]]
+    best_negative_df = best_candidate_df[~best_candidate_df[positive_set_col]]
     n_positive_names_to_match = len(best_positive_df)
     name_sets = {"all": best_candidate_df, "positive": best_positive_df, "negative": best_negative_df}

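Finally, a sketch of the threshold-curve entry point with the new guard. The toy frame below only demonstrates the parameter and the ValueError path; it likely lacks columns the full curve computation needs.

    import pandas as pd

    from emm.threshold.threshold_decision import get_threshold_curves_parameters

    best_candidate_df = pd.DataFrame({"nm_score": [0.9, 0.2], "is_positive": [True, False]})

    # Column exists: proceeds to build the positive/negative curve sets.
    curves = get_threshold_curves_parameters(best_candidate_df, positive_set_col="is_positive")

    # Missing column: the new check raises ValueError up front.
    # get_threshold_curves_parameters(best_candidate_df, positive_set_col="positive_set")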
