From b0272f7cdef1078cf2e7f661d5ae186d1a001038 Mon Sep 17 00:00:00 2001
From: Youssef Mohamed <91757835+youssefkhalil320@users.noreply.github.com>
Date: Tue, 22 Oct 2024 17:36:18 +0300
Subject: [PATCH] Update EmbeddingSimilarityEvaluator.py to save some examples
 and their scores in a csv file

Add a new option to save some examples from the evaluation dataset with their
similarity scores in a csv file, to help the user see the performance of the
model.
---
 .../EmbeddingSimilarityEvaluator.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py b/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py
index f1fe68911..72506c713 100644
--- a/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py
+++ b/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py
@@ -71,6 +71,7 @@ def __init__(
         write_csv: bool = True,
         precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] | None = None,
         truncate_dim: int | None = None,
+        samples_csv_filename: str = "results"
     ):
         """
         Constructs an evaluator based for the dataset.
@@ -97,6 +98,7 @@ def __init__(
         self.write_csv = write_csv
         self.precision = precision
         self.truncate_dim = truncate_dim
+        self.samples_csv_filename = samples_csv_filename
 
         assert len(self.sentences1) == len(self.sentences2)
         assert len(self.sentences1) == len(self.scores)
@@ -174,6 +176,7 @@ def __call__(
             precision=self.precision,
             normalize_embeddings=bool(self.precision),
         )
+
         # Binary and ubinary embeddings are packed, so we need to unpack them for the distance metrics
         if self.precision == "binary":
             embeddings1 = (embeddings1 + 128).astype(np.uint8)
@@ -210,6 +213,21 @@ def __call__(
         )
         logger.info(f"Dot-Product-Similarity:\tPearson: {eval_pearson_dot:.4f}\tSpearman: {eval_spearman_dot:.4f}")
 
+        # Print sentences with cosine similarity scores
+        print("\nSentences and their cosine similarity scores:\n")
+        # for sent1, sent2, score in zip(self.sentences1, self.sentences2, cosine_scores):
+        #     print(f"Sentence 1: {sent1}")
+        #     print(f"Sentence 2: {sent2}")
+        #     print(f"Cosine Similarity Score: {score:.4f}\n")
+        with open(self.samples_csv_filename, "w", newline="", encoding="utf-8") as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(["Sentence 1", "Sentence 2", "Cosine Score"])
+            for sent1, sent2, score in zip(self.sentences1, self.sentences2, cosine_scores):
+                writer.writerow([sent1, sent2, score])
+
+        # Print the directory where the CSV file is saved
+        print(f"CSV file saved in: {self.samples_csv_filename}")
+
         if output_path is not None and self.write_csv:
             csv_path = os.path.join(output_path, self.csv_file)
             output_file_exists = os.path.isfile(csv_path)
@@ -257,6 +275,7 @@ def __call__(
         self.store_metrics_in_model_card_data(model, metrics)
         return metrics
 
+
     @property
     def description(self) -> str:
         return "Semantic Similarity"
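
Usage note (not part of the patch): a minimal sketch of how the new samples_csv_filename option could be exercised once this change is applied. The model name, sentence pairs, gold scores, and output file name below are illustrative assumptions, not taken from the patch.

# Minimal sketch, assuming the patch above has been applied.
# Model name, sentence pairs, scores, and file name are illustrative only.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer("all-MiniLM-L6-v2")

sentences1 = ["A man is eating food.", "A plane is taking off."]
sentences2 = ["A man is eating a piece of bread.", "A bird is flying in the sky."]
gold_scores = [0.8, 0.3]  # reference similarity scores in [0, 1]

evaluator = EmbeddingSimilarityEvaluator(
    sentences1,
    sentences2,
    gold_scores,
    name="sts-dev",
    samples_csv_filename="similarity_samples.csv",  # new option introduced by this patch
)
metrics = evaluator(model)  # with the patch, also writes per-pair cosine scores to similarity_samples.csv
print(metrics)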