v1.3.4 #136

Merged · merged 6 commits on Sep 20, 2022
86 changes: 54 additions & 32 deletions DeezyMatch/candidateRanker.py
@@ -55,7 +55,8 @@ def __init__(
num_candidates=10,
search_size=4,
length_diff=None,
use_predict=True,
calc_predict=False,
calc_cosine=False,
output_path="ranker_output",
pretrained_model_path=None,
pretrained_vocab_path=None,
@@ -72,7 +73,8 @@ def __init__(
self.num_candidates = num_candidates
self.search_size = search_size
self.length_diff = length_diff
self.use_predict = use_predict
self.calc_predict = calc_predict
self.calc_cosine = calc_cosine
self.output_path = output_path
self.pretrained_model_path = pretrained_model_path
self.pretrained_vocab_path = pretrained_vocab_path
@@ -91,7 +93,8 @@ def rank(self):
num_candidates=self.num_candidates,
search_size=self.search_size,
length_diff=self.length_diff,
use_predict=self.use_predict,
calc_predict=self.calc_predict,
calc_cosine=self.calc_cosine,
output_path=self.output_path,
pretrained_model_path=self.pretrained_model_path,
pretrained_vocab_path=self.pretrained_vocab_path,
@@ -108,7 +111,8 @@ def set_query(
num_candidates=None,
search_size=None,
length_diff=None,
use_predict=True,
calc_predict=False,
calc_cosine=False,
number_test_rows=None,
output_path=None,
):
@@ -126,8 +130,10 @@ def set_query(
self.search_size = search_size
if length_diff:
self.length_diff = length_diff
if use_predict:
self.use_predict = use_predict
if calc_predict:
self.calc_predict = calc_predict
if calc_cosine:
self.calc_cosine = calc_cosine
if number_test_rows:
self.number_test_rows = number_test_rows
if output_path:
@@ -162,7 +168,8 @@ def __str__(self):
msg += f"selection_threshold:\t{self.selection_threshold}\n"
msg += f"search_size:\t\t{self.search_size}\n"
msg += f"length_diff:\t\t{self.length_diff}\n"
msg += f"use_predict:\t\t{self.use_predict}\n"
msg += f"calc_predict:\t\t{self.calc_predict}\n"
msg += f"calc_cosine:\t\t{self.calc_cosine}\n"
msg += f"number_test_rows:\t{self.number_test_rows}\n"
msg += f"---I/O---\n"
if self.input_file_path in ["default"]:
@@ -186,7 +193,8 @@ def candidate_ranker(
num_candidates=10,
search_size=4,
length_diff=None,
use_predict=True,
calc_predict=False,
calc_cosine=False,
output_path="ranker_output",
pretrained_model_path=None,
pretrained_vocab_path=None,
@@ -222,8 +230,10 @@ def candidate_ranker(
number of candidates to be tested at each iteration
length_diff
max length difference allowed between query and candidate strings
use_predict
boolean on whether to use prediction in ranking or not
calc_predict
boolean on whether to calculate prediction (i.e. model inference) or not
calc_cosine
boolean on whether to calculate cosine similarity or not
output_path
path to the output file
pretrained_model_path
@@ -254,6 +264,11 @@ def candidate_ranker(
# read input file
dl_inputs = read_input_file(input_file_path, verbose)

if not ranking_metric.lower() in ["faiss", "cosine", "conf"]:
sys.exit(
f"[ERROR] ranking_metric of {ranking_metric.lower()} is not supported. "
"Current ranking methods are: 'faiss', 'cosine', 'conf'"
)
if (ranking_metric.lower() in ["faiss"]) and (selection_threshold < 0):
sys.exit(
f"[ERROR] Threshold for the selected metric: '{ranking_metric}' should be >= 0."
@@ -264,16 +279,14 @@
sys.exit(
f"[ERROR] Threshold for the selected metric: '{ranking_metric}' should be between 0 and 1."
)
if (ranking_metric.lower() in ["conf"]) and use_predict == False:
sys.exit(
f"ranking_metric: {ranking_metric} is selected, but use_predict is set to {use_predict}"
)

if not ranking_metric.lower() in ["faiss", "cosine", "conf"]:
sys.exit(
f"[ERROR] ranking_metric of {ranking_metric.lower()} is not supported. "
"Current ranking methods are: 'faiss', 'cosine', 'conf'"
)
if (ranking_metric.lower() in ["conf"]) and calc_predict == False:
print(f"[WARNING] ranking_metric: {ranking_metric} is selected, but calc_predict is set to {calc_predict}")
print(f"[WARNING] calc_predict will be set to True.")
calc_predict = True
if (ranking_metric.lower() in ["cosine"]) and calc_cosine == False:
print(f"[WARNING] ranking_metric: {ranking_metric} is selected, but calc_cosine is set to {calc_cosine}")
print(f"[WARNING] calc_cosine will be set to True.")
calc_cosine = True

if num_candidates == 0:
sys.exit(f"[ERROR] num_candidates must be larger than 0.")
@@ -404,14 +417,18 @@ def candidate_ranker(

query_candidate_pd["label"] = "False"

# Compute cosine similarity
cosine_sim = cosine_similarity(
vecs_query[iq : (iq + 1)].detach().cpu().numpy(),
vecs_candidates.detach().cpu().numpy()[orig_id_candis],
)
cosine_dist = 1.0 - cosine_sim
if calc_cosine:
# Compute cosine similarity
cosine_sim = cosine_similarity(
vecs_query[iq : (iq + 1)].detach().cpu().numpy(),
vecs_candidates.detach().cpu().numpy()[orig_id_candis],
)
cosine_dist = 1.0 - cosine_sim
cosine_dist = cosine_dist[0]
else:
cosine_dist = [None] * len(query_candidate_pd)

if use_predict and (not pretrained_model_path in [False, None]):
if calc_predict and (not pretrained_model_path in [False, None]):
all_preds = candidate_conf_calc(
query_candidate_pd,
model,
@@ -426,7 +443,7 @@ def candidate_ranker(
query_candidate_pd["faiss_dist"] = found_neighbours[0][
0, id_0_neigh:id_1_neigh
]
query_candidate_pd["cosine_dist"] = cosine_dist[0]
query_candidate_pd["cosine_dist"] = cosine_dist
query_candidate_pd["s1_orig_ids"] = orig_id_queries
query_candidate_pd["s2_orig_ids"] = orig_id_candis

@@ -527,13 +544,16 @@ def candidate_ranker(
)[:num_candidates]

for i_row, row in collect_neigh_pd.iterrows():
if use_predict == True:
if calc_predict == True:
mydict_dl_match[row["s2_orig"]] = round(row["dl_match"], 4)
mydict_dl_1_minus_match[row["s2_orig"]] = 1.0 - round(
row["dl_match"], 4
)
mydict_faiss_dist[row["s2_orig"]] = round(row["faiss_dist"], 4)
mydict_cosine_dist[row["s2_orig"]] = round(row["cosine_dist"], 4)
if calc_cosine:
mydict_cosine_dist[row["s2_orig"]] = round(row["cosine_dist"], 4)
else:
mydict_cosine_dist[row["s2_orig"]] = row["cosine_dist"]
mydict_candid_id[row["s2_orig"]] = row["s2_orig_ids"]
one_row = {
"id": orig_id_queries,
@@ -574,7 +594,8 @@ def main():
num_candidates,
search_size,
length_diff,
use_predict,
calc_predict,
calc_cosine,
output_path,
pretrained_model_path,
pretrained_vocab_path,
@@ -593,7 +614,8 @@ def main():
num_candidates=num_candidates,
search_size=search_size,
length_diff=length_diff,
use_predict=use_predict,
calc_predict=calc_predict,
calc_cosine=calc_cosine,
output_path=output_path,
pretrained_model_path=pretrained_model_path,
pretrained_vocab_path=pretrained_vocab_path,
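For readers skimming the diff, the behavioural core of this PR is the flag resolution at the top of `candidate_ranker`. Below is a minimal standalone sketch of that logic using the names from the diff; the helper `resolve_ranking_flags` is ours for illustration and does not exist in the codebase:

```python
import sys

def resolve_ranking_flags(ranking_metric, calc_predict=False, calc_cosine=False):
    # Unsupported metrics abort early (mirrors the sys.exit call in the diff).
    metric = ranking_metric.lower()
    if metric not in ["faiss", "cosine", "conf"]:
        sys.exit(
            f"[ERROR] ranking_metric of {metric} is not supported. "
            "Current ranking methods are: 'faiss', 'cosine', 'conf'"
        )
    # 'conf' ranking needs model predictions and 'cosine' ranking needs cosine
    # distances, so the corresponding flag is force-enabled with a warning.
    if metric == "conf" and not calc_predict:
        print("[WARNING] calc_predict will be set to True.")
        calc_predict = True
    if metric == "cosine" and not calc_cosine:
        print("[WARNING] calc_cosine will be set to True.")
        calc_cosine = True
    return calc_predict, calc_cosine

# e.g. resolve_ranking_flags("cosine") returns (False, True)
```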
8 changes: 3 additions & 5 deletions DeezyMatch/tests/test_pipeline_one_col_input.py
@@ -143,7 +143,6 @@ def test_pipeline_one_col_input():
selection_threshold=5.0,
num_candidates=2,
search_size=10,
use_predict=False,
output_path="ranker_results_003/test_candidates_deezymatch",
pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",
@@ -153,22 +152,21 @@
from DeezyMatch import candidate_ranker

# Select candidates based on L2-norm distance (aka faiss distance)
# where ranking_metric is conf and use_prediction is false:
# where ranking_metric is faiss and calc_predict is false:
candidates_pd_predfalse = candidate_ranker(
query_scenario="./combined_003/queries_test",
candidate_scenario="./combined_003/candidates_test",
ranking_metric="faiss",
selection_threshold=5.0,
num_candidates=2,
search_size=10,
use_predict=True,
output_path="ranker_results_003/test_candidates_deezymatch",
pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",
number_test_rows=5,
)

# Same candidates and faiss scores should be retrieved independently of use_predict value:
# Same candidates and faiss scores should be retrieved independently of calc_predict value:
assert (candidates_pd_predtrue.faiss_distance == candidates_pd_predfalse.faiss_distance).all()

from DeezyMatch import candidate_ranker
@@ -184,7 +182,7 @@ def test_pipeline_one_col_input():
num_candidates=2,
search_size=10,
length_diff=2,
use_predict=True,
calc_predict=True,
output_path="ranker_results_003/test_candidates_deezymatch",
pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",
4 changes: 3 additions & 1 deletion DeezyMatch/utils.py
@@ -446,7 +446,9 @@ def read_command_candidate_ranker():
"-ld", "--length_diff", help="max length difference", default=None
)

parser.add_argument("-up", "--use_predict", help="use predict", default=True)
parser.add_argument("-up", "--calc_predict", help="calculate predict", default=False)

parser.add_argument("-cc", "--calc_cosine", help="calculate cosine", default=False)

parser.add_argument("-o", "--output_path", help="path to output file")

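A caveat on the argparse change above, worth flagging for anyone driving this from the command line: with `default=False` and no `type=` converter, a value supplied on the command line arrives as a string, and any non-empty string (including `"False"`) is truthy. A common workaround, shown here only as a sketch and not part of this PR, is an explicit boolean converter:

```python
import argparse

def str2bool(value):
    # Map common textual booleans; argparse otherwise hands us raw strings.
    if isinstance(value, bool):
        return value
    if value.lower() in ("true", "t", "yes", "1"):
        return True
    if value.lower() in ("false", "f", "no", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument("-up", "--calc_predict", type=str2bool, default=False)
parser.add_argument("-cc", "--calc_cosine", type=str2bool, default=False)

# '-up False' now parses to the boolean False rather than the truthy string "False".
print(parser.parse_args(["-up", "False"]).calc_predict)  # False
```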
7 changes: 4 additions & 3 deletions README.md
@@ -855,8 +855,8 @@ As mentioned, the retrieval of candidates is based on several parameters:
```
:bangbang: In `conf` (i.e., prediction-confidence), the threshold corresponds to the **minimum** accepted value, while in `faiss` and `cosine` metrics, the threshold is the **maximum** accepted value.
:bangbang: The `cosine` and `conf` scores lie in [0, 1], while the `faiss` distance can take any value in [0, +&#8734;).
* **Use prediction** (`use_predict`): If the selected ranking metric is `faiss` or `cosine`, you can choose to skip prediction (by setting it to `False`), therefore speeding up the ranking significantly.
* **Search size** (`search_size`): Unless `use_predict` is set to `False` (and therefore the prediction step is skipped during ranking), for a given query, DeezyMatch searches for candidates iteratively. At each iteration, the selected ranking metric between a query and candidates (with the size of `search_size`) is computed, and if the number of desired candidates (specified by `num_candidates`) is not reached, a new batch of candidates with the size of `search_size` is tested in the next iteration. This continues until candidates with the size of `num_candidates` are found or all the candidates are tested. If the role of `search_size` argument is not clear, refer to [Tips / Suggestions on DeezyMatch functionalities](#tips--suggestions-on-deezymatch-functionalities).
* **Calculate prediction** (`calc_predict`): If the selected ranking metric is `faiss` or `cosine`, you can choose to skip prediction (by setting it to `False`), thereby speeding up the ranking significantly.
* **Search size** (`search_size`): Unless `calc_predict` is set to `False` (in which case the prediction step is skipped during ranking), DeezyMatch searches for candidates iteratively for a given query. At each iteration, the selected ranking metric is computed between the query and a batch of `search_size` candidates; if the number of desired candidates (specified by `num_candidates`) has not been reached, a new batch of `search_size` candidates is tested in the next iteration. This continues until `num_candidates` candidates are found or all candidates have been tested. If the role of the `search_size` argument is not clear, refer to [Tips / Suggestions on DeezyMatch functionalities](#tips--suggestions-on-deezymatch-functionalities).
* **Maximum length difference** (`length_diff`): Finally, you can also specify the maximum length difference allowed between the query and the retrieved candidate strings, which may be a useful feature for certain applications.

Finally, **only for testing**, you can use `number_test_rows`. It specifies the number of queries to be used for testing.
@@ -881,7 +881,8 @@ Summary of the arguments/flags:
| num_candidates | -n | number of desired candidates |
| search_size | -sz | number of candidates to be tested at each iteration |
| length_diff | -ld | max length difference allowed between query and candidate strings |
| use_predict | -up | whether to use prediction in ranking or not |
| calc_predict | -up | whether to calculate prediction (i.e., model inference) or not |
| calc_cosine | -cc | whether to calculate cosine similarity or not |
| output_path | -o | path to the output file |
| pretrained_model_path | -mp | path to the pretrained model |
| pretrained_vocab_path | -v | path to the pretrained vocabulary |
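Putting the renamed arguments together, here is a hedged end-to-end sketch of the Python API; the scenario and model paths are placeholders borrowed from the tests and examples in this PR:

```python
from DeezyMatch import candidate_ranker

# Placeholder paths; substitute your own scenarios and model.
candidates_pd = candidate_ranker(
    query_scenario="./combined/queries_test",
    candidate_scenario="./combined/candidates_test",
    ranking_metric="faiss",      # one of "faiss", "cosine", "conf"
    selection_threshold=5.0,     # max distance for faiss/cosine, min score for conf
    num_candidates=2,
    search_size=4,
    calc_predict=False,          # skip model inference for a faster faiss-only ranking
    calc_cosine=False,           # cosine distances are reported as None when skipped
    output_path="ranker_results/test_candidates_deezymatch",
    pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
    pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
)
```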
37 changes: 35 additions & 2 deletions examples/example_001.ipynb
@@ -18,7 +18,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
"scrolled": true
},
"outputs": [],
"source": [
@@ -234,6 +234,39 @@
"candidates_pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from DeezyMatch import candidate_ranker\n",
"\n",
"# Select candidates based on L2-norm distance (aka faiss distance):\n",
"# find candidates from candidate_scenario \n",
"# for queries specified in query_scenario\n",
"candidates_pd = \\\n",
" candidate_ranker(query_scenario=\"./combined/queries_test\",\n",
" candidate_scenario=\"./combined/candidates_test\", \n",
" ranking_metric=\"cosine\", \n",
" selection_threshold=0.9, \n",
" num_candidates=2, \n",
" search_size=2, \n",
" output_path=\"ranker_results/test_candidates_deezymatch_cosine\", \n",
" pretrained_model_path=\"./models/finetuned_test001/finetuned_test001.model\", \n",
" pretrained_vocab_path=\"./models/finetuned_test001/finetuned_test001.vocab\", \n",
" number_test_rows=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"candidates_pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -391,7 +424,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.9.12"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

setuptools.setup(
name="DeezyMatch",
version="1.3.3",
version="1.3.4",
description="A Flexible Deep Learning Approach to Fuzzy String Matching and Candidate Ranking",
author=u"The LwM Development Team",
#author_email="",