From b0272f7cdef1078cf2e7f661d5ae186d1a001038 Mon Sep 17 00:00:00 2001
From: Youssef Mohamed <91757835+youssefkhalil320@users.noreply.github.com>
Date: Tue, 22 Oct 2024 17:36:18 +0300
Subject: [PATCH] Update EmbeddingSimilarityEvaluator.py to save some examples
 and their scores in a csv file

Add a new option to save some examples from the evaluation dataset with their
similarity scores in a csv file, to help the user see the performance of the
model.
---
 .../EmbeddingSimilarityEvaluator.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py b/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py
index f1fe68911..72506c713 100644
--- a/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py
+++ b/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py
@@ -71,6 +71,7 @@ def __init__(
         write_csv: bool = True,
         precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] | None = None,
         truncate_dim: int | None = None,
+        samples_csv_filename: str = "results"
     ):
         """
         Constructs an evaluator based for the dataset.
@@ -97,6 +98,7 @@ def __init__(
         self.write_csv = write_csv
         self.precision = precision
         self.truncate_dim = truncate_dim
+        self.samples_csv_filename = samples_csv_filename
 
         assert len(self.sentences1) == len(self.sentences2)
         assert len(self.sentences1) == len(self.scores)
@@ -174,6 +176,7 @@ def __call__(
             precision=self.precision,
             normalize_embeddings=bool(self.precision),
         )
+
         # Binary and ubinary embeddings are packed, so we need to unpack them for the distance metrics
         if self.precision == "binary":
             embeddings1 = (embeddings1 + 128).astype(np.uint8)
@@ -210,6 +213,21 @@ def __call__(
         )
         logger.info(f"Dot-Product-Similarity:\tPearson: {eval_pearson_dot:.4f}\tSpearman: {eval_spearman_dot:.4f}")
 
+        # Print sentences with cosine similarity scores
+        print("\nSentences and their cosine similarity scores:\n")
+        # for sent1, sent2, score in zip(self.sentences1, self.sentences2, cosine_scores):
+        #     print(f"Sentence 1: {sent1}")
+        #     print(f"Sentence 2: {sent2}")
+        #     print(f"Cosine Similarity Score: {score:.4f}\n")
+        with open(self.samples_csv_filename, "w", newline="", encoding="utf-8") as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(["Sentence 1", "Sentence 2", "Cosine Score"])
+            for sent1, sent2, score in zip(self.sentences1, self.sentences2, cosine_scores):
+                writer.writerow([sent1, sent2, score])
+
+        # Print the directory where the CSV file is saved
+        print(f"CSV file saved in: {self.samples_csv_filename}")
+
         if output_path is not None and self.write_csv:
             csv_path = os.path.join(output_path, self.csv_file)
             output_file_exists = os.path.isfile(csv_path)
@@ -257,6 +275,7 @@ def __call__(
         self.store_metrics_in_model_card_data(model, metrics)
         return metrics
 
+
     @property
     def description(self) -> str:
         return "Semantic Similarity"
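
Usage note (not part of the patch): a minimal sketch of how the new samples_csv_filename option could be exercised once this change is applied. The model name, sentence pairs, gold scores, and output file name below are illustrative assumptions, not taken from the patch.

# Minimal sketch, assuming the patch above has been applied.
# Model name, sentence pairs, scores, and file name are illustrative only.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer("all-MiniLM-L6-v2")

sentences1 = ["A man is eating food.", "A plane is taking off."]
sentences2 = ["A man is eating a piece of bread.", "A bird is flying in the sky."]
gold_scores = [0.8, 0.3]  # reference similarity scores in [0, 1]

evaluator = EmbeddingSimilarityEvaluator(
    sentences1,
    sentences2,
    gold_scores,
    name="sts-dev",
    samples_csv_filename="similarity_samples.csv",  # new option introduced by this patch
)
metrics = evaluator(model)  # with the patch, also writes per-pair cosine scores to similarity_samples.csv
print(metrics)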