diff --git a/docs/images/boxplot-naive-vs-biochatter.pdf b/docs/images/boxplot-naive-vs-biochatter.pdf
index dda69e06..95ceb526 100644
Binary files a/docs/images/boxplot-naive-vs-biochatter.pdf and b/docs/images/boxplot-naive-vs-biochatter.pdf differ
diff --git a/docs/images/dotplot-per-task.pdf b/docs/images/dotplot-per-task.pdf
index d4c99e24..f0064561 100644
Binary files a/docs/images/dotplot-per-task.pdf and b/docs/images/dotplot-per-task.pdf differ
diff --git a/docs/images/dotplot-per-task.png b/docs/images/dotplot-per-task.png
index a3add95d..b8ef0493 100644
Binary files a/docs/images/dotplot-per-task.png and b/docs/images/dotplot-per-task.png differ
diff --git a/docs/images/scatter-naive-vs-biochatter.pdf b/docs/images/scatter-naive-vs-biochatter.pdf
new file mode 100644
index 00000000..5aab0275
Binary files /dev/null and b/docs/images/scatter-naive-vs-biochatter.pdf differ
diff --git a/docs/images/scatter-naive-vs-biochatter.png b/docs/images/scatter-naive-vs-biochatter.png
new file mode 100644
index 00000000..ceceda14
Binary files /dev/null and b/docs/images/scatter-naive-vs-biochatter.png differ
diff --git a/docs/images/scatter-per-quantisation-name.pdf b/docs/images/scatter-per-quantisation-name.pdf
index 794df2db..5cf5c0eb 100644
Binary files a/docs/images/scatter-per-quantisation-name.pdf and b/docs/images/scatter-per-quantisation-name.pdf differ
diff --git a/docs/images/scatter-per-quantisation-name.png b/docs/images/scatter-per-quantisation-name.png
index 275d0a61..e045fb5c 100644
Binary files a/docs/images/scatter-per-quantisation-name.png and b/docs/images/scatter-per-quantisation-name.png differ
diff --git a/docs/images/scatter-quantisation-accuracy.pdf b/docs/images/scatter-quantisation-accuracy.pdf
index 7e036aeb..a9f7cac3 100644
Binary files a/docs/images/scatter-quantisation-accuracy.pdf and b/docs/images/scatter-quantisation-accuracy.pdf differ
diff --git a/docs/images/scatter-size-accuracy.pdf b/docs/images/scatter-size-accuracy.pdf
index 4a56accb..d72d8fe2 100644
Binary files a/docs/images/scatter-size-accuracy.pdf and b/docs/images/scatter-size-accuracy.pdf differ
diff --git a/docs/images/stripplot-extraction-tasks.png b/docs/images/stripplot-extraction-tasks.png
index d31f5a52..9918d438 100644
Binary files a/docs/images/stripplot-extraction-tasks.png and b/docs/images/stripplot-extraction-tasks.png differ
diff --git a/docs/images/stripplot-per-model.png b/docs/images/stripplot-per-model.png
index b7879440..d50cf911 100644
Binary files a/docs/images/stripplot-per-model.png and b/docs/images/stripplot-per-model.png differ
diff --git a/docs/images/stripplot-rag-tasks.pdf b/docs/images/stripplot-rag-tasks.pdf
index 87dab66d..749d75a5 100644
Binary files a/docs/images/stripplot-rag-tasks.pdf and b/docs/images/stripplot-rag-tasks.pdf differ
diff --git a/docs/images/stripplot-rag-tasks.png b/docs/images/stripplot-rag-tasks.png
index e2ffb48a..d0149d1c 100644
Binary files a/docs/images/stripplot-rag-tasks.png and b/docs/images/stripplot-rag-tasks.png differ
diff --git a/docs/images/violin-naive-vs-biochatter.pdf b/docs/images/violin-naive-vs-biochatter.pdf
new file mode 100644
index 00000000..264eb30d
Binary files /dev/null and b/docs/images/violin-naive-vs-biochatter.pdf differ
diff --git a/docs/images/violin-naive-vs-biochatter.png b/docs/images/violin-naive-vs-biochatter.png
new file mode 100644
index 00000000..48c53bba
Binary files /dev/null and b/docs/images/violin-naive-vs-biochatter.png differ
diff --git a/docs/scripts/hooks.py b/docs/scripts/hooks.py
index e36ca46b..0456f42e 100644
--- a/docs/scripts/hooks.py
+++ b/docs/scripts/hooks.py
@@ -117,9 +117,7 @@ def plot_text2cypher():
             else (
                 "llama-3"
                 if "llama-3" in x
-                else "gpt"
-                if "gpt" in x
-                else "other open source"
+                else "gpt" if "gpt" in x else "other open source"
             )
         )
     )
@@ -261,9 +259,9 @@ def preprocess_results_for_frontend(
         axis=1,
     )
 
-    aggregated_scores[
-        "Full model name"
-    ] = aggregated_scores.index.get_level_values("model_name")
+    aggregated_scores["Full model name"] = (
+        aggregated_scores.index.get_level_values("model_name")
+    )
     aggregated_scores["Score achieved"] = aggregated_scores["score_achieved"]
     aggregated_scores["Score possible"] = aggregated_scores["score_possible"]
     aggregated_scores["Score SD"] = aggregated_scores["score_sd"]
@@ -333,9 +331,9 @@ def write_individual_extraction_task_results(raw_results: pd.DataFrame) -> None:
         axis=1,
     )
 
-    aggregated_scores[
-        "Full model name"
-    ] = aggregated_scores.index.get_level_values("model_name")
+    aggregated_scores["Full model name"] = (
+        aggregated_scores.index.get_level_values("model_name")
+    )
     aggregated_scores["Subtask"] = aggregated_scores.index.get_level_values(
         "subtask"
     )
@@ -392,9 +390,9 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]):
     )
 
     overview_per_quantisation = overview
-    overview_per_quantisation[
-        "Full model name"
-    ] = overview_per_quantisation.index
+    overview_per_quantisation["Full model name"] = (
+        overview_per_quantisation.index
+    )
     overview_per_quantisation[
         ["Model name", "Size", "Version", "Quantisation"]
     ] = overview_per_quantisation["Full model name"].str.split(":", expand=True)
@@ -428,9 +426,9 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]):
         ]
     ]
     # round mean and sd to 2 decimal places
-    overview_per_quantisation.loc[
-        :, "Median Accuracy"
-    ] = overview_per_quantisation["Median Accuracy"].round(2)
+    overview_per_quantisation.loc[:, "Median Accuracy"] = (
+        overview_per_quantisation["Median Accuracy"].round(2)
+    )
     overview_per_quantisation.loc[:, "SD"] = overview_per_quantisation[
         "SD"
     ].round(2)
@@ -878,9 +876,9 @@ def plot_extraction_tasks():
         axis=1,
     )
 
-    aggregated_scores[
-        "Full model name"
-    ] = aggregated_scores.index.get_level_values("model_name")
+    aggregated_scores["Full model name"] = (
+        aggregated_scores.index.get_level_values("model_name")
+    )
     aggregated_scores["Subtask"] = aggregated_scores.index.get_level_values(
         "subtask"
     )
@@ -1093,6 +1091,9 @@ def plot_comparison_naive_biochatter(overview):
         )
     ]
 
+    # print number of rows of each task
+    print(overview_melted["Task"].value_counts())
+
     sns.set_theme(style="whitegrid")
     plt.figure(figsize=(6, 4))
     sns.boxplot(
@@ -1118,6 +1119,56 @@ def plot_comparison_naive_biochatter(overview):
     )
     plt.close()
 
+    # plot scatter plot (seaborn strip plot with jitter)
+    plt.figure(figsize=(6, 4))
+    sns.stripplot(
+        x="Task",
+        y="Accuracy",
+        data=overview_melted,
+        jitter=0.2,
+        alpha=0.8,
+    )
+    plt.ylim(0, 1)
+    plt.xlabel(None)
+    plt.xticks(
+        ticks=range(len(overview_melted["Task"].unique())),
+        labels=["BioChatter", "Naive LLM (using full YAML schema)"],
+    )
+    plt.savefig(
+        "docs/images/scatter-naive-vs-biochatter.png",
+        bbox_inches="tight",
+        dpi=300,
+    )
+    plt.savefig(
+        "docs/images/scatter-naive-vs-biochatter.pdf",
+        bbox_inches="tight",
+    )
+    plt.close()
+
+    # plot violin plot
+    plt.figure(figsize=(6, 4))
+    sns.violinplot(
+        x="Task",
+        y="Accuracy",
+        data=overview_melted,
+    )
+    plt.ylim(0, 1)
+    plt.xlabel(None)
+    plt.xticks(
+        ticks=range(len(overview_melted["Task"].unique())),
+        labels=["BioChatter", "Naive LLM (using full YAML schema)"],
+    )
+    plt.savefig(
+        "docs/images/violin-naive-vs-biochatter.png",
+        bbox_inches="tight",
+        dpi=300,
+    )
+    plt.savefig(
+        "docs/images/violin-naive-vs-biochatter.pdf",
+        bbox_inches="tight",
+    )
+    plt.close()
+
 
 def calculate_stats(overview):
     overview_melted = melt_and_process(overview)