Skip to content

Commit

Permalink
Merge pull request #136 from Living-with-machines/develop
Browse files Browse the repository at this point in the history
v1.3.4
  • Loading branch information
kasra-hosseini authored Sep 20, 2022
2 parents b47053f + 6c5f536 commit b3e5504
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 44 deletions.
86 changes: 54 additions & 32 deletions DeezyMatch/candidateRanker.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def __init__(
num_candidates=10,
search_size=4,
length_diff=None,
use_predict=True,
calc_predict=False,
calc_cosine=False,
output_path="ranker_output",
pretrained_model_path=None,
pretrained_vocab_path=None,
Expand All @@ -72,7 +73,8 @@ def __init__(
self.num_candidates = num_candidates
self.search_size = search_size
self.length_diff = length_diff
self.use_predict = use_predict
self.calc_predict = calc_predict
self.calc_cosine = calc_cosine
self.output_path = output_path
self.pretrained_model_path = pretrained_model_path
self.pretrained_vocab_path = pretrained_vocab_path
Expand All @@ -91,7 +93,8 @@ def rank(self):
num_candidates=self.num_candidates,
search_size=self.search_size,
length_diff=self.length_diff,
use_predict=self.use_predict,
calc_predict=self.calc_predict,
calc_cosine=self.calc_cosine,
output_path=self.output_path,
pretrained_model_path=self.pretrained_model_path,
pretrained_vocab_path=self.pretrained_vocab_path,
Expand All @@ -108,7 +111,8 @@ def set_query(
num_candidates=None,
search_size=None,
length_diff=None,
use_predict=True,
calc_predict=False,
calc_cosine=False,
number_test_rows=None,
output_path=None,
):
Expand All @@ -126,8 +130,10 @@ def set_query(
self.search_size = search_size
if length_diff:
self.length_diff = length_diff
if use_predict:
self.use_predict = use_predict
if calc_predict:
self.calc_predict = calc_predict
if calc_cosine:
self.calc_cosine = calc_cosine
if number_test_rows:
self.number_test_rows = number_test_rows
if output_path:
Expand Down Expand Up @@ -162,7 +168,8 @@ def __str__(self):
msg += f"selection_threshold:\t{self.selection_threshold}\n"
msg += f"search_size:\t\t{self.search_size}\n"
msg += f"length_diff:\t\t{self.length_diff}\n"
msg += f"use_predict:\t\t{self.use_predict}\n"
msg += f"calc_predict:\t\t{self.calc_predict}\n"
msg += f"calc_cosine:\t\t{self.calc_cosine}\n"
msg += f"number_test_rows:\t{self.number_test_rows}\n"
msg += f"---I/O---\n"
if self.input_file_path in ["default"]:
Expand All @@ -186,7 +193,8 @@ def candidate_ranker(
num_candidates=10,
search_size=4,
length_diff=None,
use_predict=True,
calc_predict=False,
calc_cosine=False,
output_path="ranker_output",
pretrained_model_path=None,
pretrained_vocab_path=None,
Expand Down Expand Up @@ -222,8 +230,10 @@ def candidate_ranker(
number of candidates to be tested at each iteration
length_diff
max length difference allowed between query and candidate strings
use_predict
boolean on whether to use prediction in ranking or not
calc_predict
boolean on whether to calculate prediction (i.e. model inference) or not
calc_cosine
boolean on whether to calculate cosine similarity or not
output_path
path to the output file
pretrained_model_path
Expand Down Expand Up @@ -254,6 +264,11 @@ def candidate_ranker(
# read input file
dl_inputs = read_input_file(input_file_path, verbose)

if not ranking_metric.lower() in ["faiss", "cosine", "conf"]:
sys.exit(
f"[ERROR] ranking_metric of {ranking_metric.lower()} is not supported. "
"Current ranking methods are: 'faiss', 'cosine', 'conf'"
)
if (ranking_metric.lower() in ["faiss"]) and (selection_threshold < 0):
sys.exit(
f"[ERROR] Threshold for the selected metric: '{ranking_metric}' should be >= 0."
Expand All @@ -264,16 +279,14 @@ def candidate_ranker(
sys.exit(
f"[ERROR] Threshold for the selected metric: '{ranking_metric}' should be between 0 and 1."
)
if (ranking_metric.lower() in ["conf"]) and use_predict == False:
sys.exit(
f"ranking_metric: {ranking_metric} is selected, but use_predict is set to {use_predict}"
)

if not ranking_metric.lower() in ["faiss", "cosine", "conf"]:
sys.exit(
f"[ERROR] ranking_metric of {ranking_metric.lower()} is not supported. "
"Current ranking methods are: 'faiss', 'cosine', 'conf'"
)
if (ranking_metric.lower() in ["conf"]) and calc_predict == False:
print(f"[WARNING] ranking_metric: {ranking_metric} is selected, but calc_predict is set to {calc_predict}")
print(f"[WARNING] calc_predict will be set to True.")
calc_predict = True
if (ranking_metric.lower() in ["cosine"]) and calc_cosine == False:
print(f"[WARNING] ranking_metric: {ranking_metric} is selected, but calc_cosine is set to {calc_cosine}")
print(f"[WARNING] calc_cosine will be set to True.")
calc_cosine = True

if num_candidates == 0:
sys.exit(f"[ERROR] num_candidates must be larger than 0.")
Expand Down Expand Up @@ -404,14 +417,18 @@ def candidate_ranker(

query_candidate_pd["label"] = "False"

# Compute cosine similarity
cosine_sim = cosine_similarity(
vecs_query[iq : (iq + 1)].detach().cpu().numpy(),
vecs_candidates.detach().cpu().numpy()[orig_id_candis],
)
cosine_dist = 1.0 - cosine_sim
if calc_cosine:
# Compute cosine similarity
cosine_sim = cosine_similarity(
vecs_query[iq : (iq + 1)].detach().cpu().numpy(),
vecs_candidates.detach().cpu().numpy()[orig_id_candis],
)
cosine_dist = 1.0 - cosine_sim
cosine_dist = cosine_dist[0]
else:
cosine_dist = [None] * len(query_candidate_pd)

if use_predict and (not pretrained_model_path in [False, None]):
if calc_predict and (not pretrained_model_path in [False, None]):
all_preds = candidate_conf_calc(
query_candidate_pd,
model,
Expand All @@ -426,7 +443,7 @@ def candidate_ranker(
query_candidate_pd["faiss_dist"] = found_neighbours[0][
0, id_0_neigh:id_1_neigh
]
query_candidate_pd["cosine_dist"] = cosine_dist[0]
query_candidate_pd["cosine_dist"] = cosine_dist
query_candidate_pd["s1_orig_ids"] = orig_id_queries
query_candidate_pd["s2_orig_ids"] = orig_id_candis

Expand Down Expand Up @@ -527,13 +544,16 @@ def candidate_ranker(
)[:num_candidates]

for i_row, row in collect_neigh_pd.iterrows():
if use_predict == True:
if calc_predict == True:
mydict_dl_match[row["s2_orig"]] = round(row["dl_match"], 4)
mydict_dl_1_minus_match[row["s2_orig"]] = 1.0 - round(
row["dl_match"], 4
)
mydict_faiss_dist[row["s2_orig"]] = round(row["faiss_dist"], 4)
mydict_cosine_dist[row["s2_orig"]] = round(row["cosine_dist"], 4)
if calc_cosine:
mydict_cosine_dist[row["s2_orig"]] = round(row["cosine_dist"], 4)
else:
mydict_cosine_dist[row["s2_orig"]] = row["cosine_dist"]
mydict_candid_id[row["s2_orig"]] = row["s2_orig_ids"]
one_row = {
"id": orig_id_queries,
Expand Down Expand Up @@ -574,7 +594,8 @@ def main():
num_candidates,
search_size,
length_diff,
use_predict,
calc_predict,
calc_cosine,
output_path,
pretrained_model_path,
pretrained_vocab_path,
Expand All @@ -593,7 +614,8 @@ def main():
num_candidates=num_candidates,
search_size=search_size,
length_diff=length_diff,
use_predict=use_predict,
calc_predict=calc_predict,
calc_cosine=calc_cosine,
output_path=output_path,
pretrained_model_path=pretrained_model_path,
pretrained_vocab_path=pretrained_vocab_path,
Expand Down
8 changes: 3 additions & 5 deletions DeezyMatch/tests/test_pipeline_one_col_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ def test_pipeline_one_col_input():
selection_threshold=5.0,
num_candidates=2,
search_size=10,
use_predict=False,
output_path="ranker_results_003/test_candidates_deezymatch",
pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",
Expand All @@ -153,22 +152,21 @@ def test_pipeline_one_col_input():
from DeezyMatch import candidate_ranker

# Select candidates based on L2-norm distance (aka faiss distance)
# where ranking_metric is conf and use_prediction is false:
# where ranking_metric is faiss and calc_predict is not set (defaults to False):
candidates_pd_predfalse = candidate_ranker(
query_scenario="./combined_003/queries_test",
candidate_scenario="./combined_003/candidates_test",
ranking_metric="faiss",
selection_threshold=5.0,
num_candidates=2,
search_size=10,
use_predict=True,
output_path="ranker_results_003/test_candidates_deezymatch",
pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",
number_test_rows=5,
)

# Same candidates and faiss scores should be retrieved independently of use_predict value:
# The same candidates and faiss scores should be retrieved regardless of the calc_predict value:
candidates_pd_predtrue.faiss_distance == candidates_pd_predfalse.faiss_distance

from DeezyMatch import candidate_ranker
Expand All @@ -184,7 +182,7 @@ def test_pipeline_one_col_input():
num_candidates=2,
search_size=10,
length_diff=2,
use_predict=True,
calc_predict=True,
output_path="ranker_results_003/test_candidates_deezymatch",
pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",
Expand Down
4 changes: 3 additions & 1 deletion DeezyMatch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,9 @@ def read_command_candidate_ranker():
"-ld", "--length_diff", help="max length difference", default=None
)

parser.add_argument("-up", "--use_predict", help="use predict", default=True)
parser.add_argument("-up", "--calc_predict", help="calculate predict", default=False)

parser.add_argument("-cc", "--calc_cosine", help="calculate cosine", default=False)

parser.add_argument("-o", "--output_path", help="path to output file")

Expand Down
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -855,8 +855,8 @@ As mentioned, the retrieval of candidates is based on several parameters:
```
:bangbang: In `conf` (i.e., prediction-confidence), the threshold corresponds to the **minimum** accepted value, while in `faiss` and `cosine` metrics, the threshold is the **maximum** accepted value.
:bangbang: The `cosine` and `conf` scores lie in [0, 1], while the `faiss` distance can take any value in [0, +&#8734;).
* **Use prediction** (`use_predict`): If the selected ranking metric is `faiss` or `cosine`, you can choose to skip prediction (by setting it to `False`), therefore speeding up the ranking significantly.
* **Search size** (`search_size`): Unless `use_predict` is set to `False` (and therefore the prediction step is skipped during ranking), for a given query, DeezyMatch searches for candidates iteratively. At each iteration, the selected ranking metric between a query and candidates (with the size of `search_size`) is computed, and if the number of desired candidates (specified by `num_candidates`) is not reached, a new batch of candidates with the size of `search_size` is tested in the next iteration. This continues until candidates with the size of `num_candidates` are found or all the candidates are tested. If the role of `search_size` argument is not clear, refer to [Tips / Suggestions on DeezyMatch functionalities](#tips--suggestions-on-deezymatch-functionalities).
* **Calculate prediction** (`calc_predict`): If the selected ranking metric is `faiss` or `cosine`, you can choose to skip prediction (by setting it to `False`), therefore speeding up the ranking significantly.
* **Search size** (`search_size`): Unless `calc_predict` is set to `False` (and therefore the prediction step is skipped during ranking), for a given query, DeezyMatch searches for candidates iteratively. At each iteration, the selected ranking metric is computed between the query and a batch of `search_size` candidates; if the desired number of candidates (specified by `num_candidates`) has not been reached, a new batch of `search_size` candidates is tested in the next iteration. This continues until `num_candidates` candidates are found or all the candidates have been tested. If the role of the `search_size` argument is not clear, refer to [Tips / Suggestions on DeezyMatch functionalities](#tips--suggestions-on-deezymatch-functionalities).
* **Maximum length difference** (`length_diff`): Finally, you can also specify the maximum length difference allowed between the query and the retrieved candidate strings, which may be a useful feature for certain applications.
Finally, **only for testing**, you can use `number_test_rows`. It specifies the number of queries to be used for testing.
Expand All @@ -881,7 +881,8 @@ Summary of the arguments/flags:
| num_candidates | -n | number of desired candidates |
| search_size | -sz | number of candidates to be tested at each iteration |
| length_diff | -ld | max length difference allowed between query and candidate strings |
| use_predict | -up | whether to use prediction in ranking or not |
| calc_predict | -up | whether to calculate prediction (i.e., model inference) or not |
| calc_cosine | -cc | whether to calculate cosine similarity or not |
| output_path | -o | path to the output file |
| pretrained_model_path | -mp | path to the pretrained model |
| pretrained_vocab_path | -v | path to the pretrained vocabulary |
Expand Down
37 changes: 35 additions & 2 deletions examples/example_001.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
"scrolled": true
},
"outputs": [],
"source": [
Expand Down Expand Up @@ -234,6 +234,39 @@
"candidates_pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from DeezyMatch import candidate_ranker\n",
"\n",
"# Select candidates based on cosine distance:\n",
"# find candidates from candidate_scenario \n",
"# for queries specified in query_scenario\n",
"candidates_pd = \\\n",
" candidate_ranker(query_scenario=\"./combined/queries_test\",\n",
" candidate_scenario=\"./combined/candidates_test\", \n",
" ranking_metric=\"cosine\", \n",
" selection_threshold=0.9, \n",
" num_candidates=2, \n",
" search_size=2, \n",
" output_path=\"ranker_results/test_candidates_deezymatch_cosine\", \n",
" pretrained_model_path=\"./models/finetuned_test001/finetuned_test001.model\", \n",
" pretrained_vocab_path=\"./models/finetuned_test001/finetuned_test001.vocab\", \n",
" number_test_rows=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"candidates_pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -391,7 +424,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.9.12"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setuptools.setup(
name="DeezyMatch",
version="1.3.3",
version="1.3.4",
description="A Flexible Deep Learning Approach to Fuzzy String Matching and Candidate Ranking",
author=u"The LwM Development Team",
#author_email="",
Expand Down

0 comments on commit b3e5504

Please sign in to comment.