diff --git a/docs/images/boxplot-naive-vs-biochatter.pdf b/docs/images/boxplot-naive-vs-biochatter.pdf index dda69e06..95ceb526 100644 Binary files a/docs/images/boxplot-naive-vs-biochatter.pdf and b/docs/images/boxplot-naive-vs-biochatter.pdf differ diff --git a/docs/images/dotplot-per-task.pdf b/docs/images/dotplot-per-task.pdf index d4c99e24..f0064561 100644 Binary files a/docs/images/dotplot-per-task.pdf and b/docs/images/dotplot-per-task.pdf differ diff --git a/docs/images/dotplot-per-task.png b/docs/images/dotplot-per-task.png index a3add95d..b8ef0493 100644 Binary files a/docs/images/dotplot-per-task.png and b/docs/images/dotplot-per-task.png differ diff --git a/docs/images/scatter-naive-vs-biochatter.pdf b/docs/images/scatter-naive-vs-biochatter.pdf new file mode 100644 index 00000000..5aab0275 Binary files /dev/null and b/docs/images/scatter-naive-vs-biochatter.pdf differ diff --git a/docs/images/scatter-naive-vs-biochatter.png b/docs/images/scatter-naive-vs-biochatter.png new file mode 100644 index 00000000..ceceda14 Binary files /dev/null and b/docs/images/scatter-naive-vs-biochatter.png differ diff --git a/docs/images/scatter-per-quantisation-name.pdf b/docs/images/scatter-per-quantisation-name.pdf index 794df2db..5cf5c0eb 100644 Binary files a/docs/images/scatter-per-quantisation-name.pdf and b/docs/images/scatter-per-quantisation-name.pdf differ diff --git a/docs/images/scatter-per-quantisation-name.png b/docs/images/scatter-per-quantisation-name.png index 275d0a61..e045fb5c 100644 Binary files a/docs/images/scatter-per-quantisation-name.png and b/docs/images/scatter-per-quantisation-name.png differ diff --git a/docs/images/scatter-quantisation-accuracy.pdf b/docs/images/scatter-quantisation-accuracy.pdf index 7e036aeb..a9f7cac3 100644 Binary files a/docs/images/scatter-quantisation-accuracy.pdf and b/docs/images/scatter-quantisation-accuracy.pdf differ diff --git a/docs/images/scatter-size-accuracy.pdf b/docs/images/scatter-size-accuracy.pdf index 
4a56accb..d72d8fe2 100644 Binary files a/docs/images/scatter-size-accuracy.pdf and b/docs/images/scatter-size-accuracy.pdf differ diff --git a/docs/images/stripplot-extraction-tasks.png b/docs/images/stripplot-extraction-tasks.png index d31f5a52..9918d438 100644 Binary files a/docs/images/stripplot-extraction-tasks.png and b/docs/images/stripplot-extraction-tasks.png differ diff --git a/docs/images/stripplot-per-model.png b/docs/images/stripplot-per-model.png index b7879440..d50cf911 100644 Binary files a/docs/images/stripplot-per-model.png and b/docs/images/stripplot-per-model.png differ diff --git a/docs/images/stripplot-rag-tasks.pdf b/docs/images/stripplot-rag-tasks.pdf index 87dab66d..749d75a5 100644 Binary files a/docs/images/stripplot-rag-tasks.pdf and b/docs/images/stripplot-rag-tasks.pdf differ diff --git a/docs/images/stripplot-rag-tasks.png b/docs/images/stripplot-rag-tasks.png index e2ffb48a..d0149d1c 100644 Binary files a/docs/images/stripplot-rag-tasks.png and b/docs/images/stripplot-rag-tasks.png differ diff --git a/docs/images/violin-naive-vs-biochatter.pdf b/docs/images/violin-naive-vs-biochatter.pdf new file mode 100644 index 00000000..264eb30d Binary files /dev/null and b/docs/images/violin-naive-vs-biochatter.pdf differ diff --git a/docs/images/violin-naive-vs-biochatter.png b/docs/images/violin-naive-vs-biochatter.png new file mode 100644 index 00000000..48c53bba Binary files /dev/null and b/docs/images/violin-naive-vs-biochatter.png differ diff --git a/docs/scripts/hooks.py b/docs/scripts/hooks.py index e36ca46b..0456f42e 100644 --- a/docs/scripts/hooks.py +++ b/docs/scripts/hooks.py @@ -117,9 +117,7 @@ def plot_text2cypher(): else ( "llama-3" if "llama-3" in x - else "gpt" - if "gpt" in x - else "other open source" + else "gpt" if "gpt" in x else "other open source" ) ) ) @@ -261,9 +259,9 @@ def preprocess_results_for_frontend( axis=1, ) - aggregated_scores[ - "Full model name" - ] = aggregated_scores.index.get_level_values("model_name") + 
aggregated_scores["Full model name"] = ( + aggregated_scores.index.get_level_values("model_name") + ) aggregated_scores["Score achieved"] = aggregated_scores["score_achieved"] aggregated_scores["Score possible"] = aggregated_scores["score_possible"] aggregated_scores["Score SD"] = aggregated_scores["score_sd"] @@ -333,9 +331,9 @@ def write_individual_extraction_task_results(raw_results: pd.DataFrame) -> None: axis=1, ) - aggregated_scores[ - "Full model name" - ] = aggregated_scores.index.get_level_values("model_name") + aggregated_scores["Full model name"] = ( + aggregated_scores.index.get_level_values("model_name") + ) aggregated_scores["Subtask"] = aggregated_scores.index.get_level_values( "subtask" ) @@ -392,9 +390,9 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]): ) overview_per_quantisation = overview - overview_per_quantisation[ - "Full model name" - ] = overview_per_quantisation.index + overview_per_quantisation["Full model name"] = ( + overview_per_quantisation.index + ) overview_per_quantisation[ ["Model name", "Size", "Version", "Quantisation"] ] = overview_per_quantisation["Full model name"].str.split(":", expand=True) @@ -428,9 +426,9 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]): ] ] # round mean and sd to 2 decimal places - overview_per_quantisation.loc[ - :, "Median Accuracy" - ] = overview_per_quantisation["Median Accuracy"].round(2) + overview_per_quantisation.loc[:, "Median Accuracy"] = ( + overview_per_quantisation["Median Accuracy"].round(2) + ) overview_per_quantisation.loc[:, "SD"] = overview_per_quantisation[ "SD" ].round(2) @@ -878,9 +876,9 @@ def plot_extraction_tasks(): axis=1, ) - aggregated_scores[ - "Full model name" - ] = aggregated_scores.index.get_level_values("model_name") + aggregated_scores["Full model name"] = ( + aggregated_scores.index.get_level_values("model_name") + ) aggregated_scores["Subtask"] = aggregated_scores.index.get_level_values( "subtask" ) @@ 
-1093,6 +1091,9 @@ def plot_comparison_naive_biochatter(overview): ) ] + # print number of rows of each task + print(overview_melted["Task"].value_counts()) + sns.set_theme(style="whitegrid") plt.figure(figsize=(6, 4)) sns.boxplot( @@ -1118,6 +1119,56 @@ def plot_comparison_naive_biochatter(overview): ) plt.close() + # plot scatter plot + plt.figure(figsize=(6, 4)) + sns.stripplot( + x="Task", + y="Accuracy", + data=overview_melted, + jitter=0.2, + alpha=0.8, + ) + plt.ylim(0, 1) + plt.xlabel(None) + plt.xticks( + ticks=range(len(overview_melted["Task"].unique())), + labels=["BioChatter", "Naive LLM (using full YAML schema)"], + ) + plt.savefig( + "docs/images/scatter-naive-vs-biochatter.png", + bbox_inches="tight", + dpi=300, + ) + plt.savefig( + "docs/images/scatter-naive-vs-biochatter.pdf", + bbox_inches="tight", + ) + plt.close() + + # plot violin plot + plt.figure(figsize=(6, 4)) + sns.violinplot( + x="Task", + y="Accuracy", + data=overview_melted, + ) + plt.ylim(0, 1) + plt.xlabel(None) + plt.xticks( + ticks=range(len(overview_melted["Task"].unique())), + labels=["BioChatter", "Naive LLM (using full YAML schema)"], + ) + plt.savefig( + "docs/images/violin-naive-vs-biochatter.png", + bbox_inches="tight", + dpi=300, + ) + plt.savefig( + "docs/images/violin-naive-vs-biochatter.pdf", + bbox_inches="tight", + ) + plt.close() + def calculate_stats(overview): overview_melted = melt_and_process(overview)