diff --git a/.gitignore b/.gitignore
index 566bf84..3ab8c64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ src/tex/figures/*
 # Miscellaneous
 __pycache__
 *.synctex.gz
+
+*/.DS_Store
diff --git a/Snakefile b/Snakefile
index 162e3c5..b404e87 100644
--- a/Snakefile
+++ b/Snakefile
@@ -14,4 +14,14 @@ rule question_plots:
     output:
         "src/tex/figures/question_count_barplot.pdf"
     script:
-        "src/scripts/plot_statistics.py"
\ No newline at end of file
+        "src/scripts/plot_statistics.py"
+
+
+rule human_statistics:
+    input:
+        "src/scripts/analyze_human_data.py"
+    output:
+        ["src/tex/output/number_experts.txt", "src/tex/output/total_hours.txt", "src/tex/output/human_timing.pdf"]
+        # "src/tex/output/human_questions.csv", "src/tex/output/human_questions.pkl"
+    script:
+        "src/scripts/analyze_human_data.py"
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 0f07a8b..0e21419 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,4 +4,5 @@ dependencies:
   - pip:
       - matplotlib
       - numpy==1.26.3
+      - seaborn
       - git+https://github.com/lamalab-org/chem-bench.git@humanbench
\ No newline at end of file
diff --git a/src/.DS_Store b/src/.DS_Store
index 87278d1..5a6c290 100644
Binary files a/src/.DS_Store and b/src/.DS_Store differ
diff --git a/src/scripts/analyze_human_data.py b/src/scripts/analyze_human_data.py
new file mode 100644
index 0000000..4b23697
--- /dev/null
+++ b/src/scripts/analyze_human_data.py
@@ -0,0 +1,80 @@
+from chembench.analysis import load_all_reports
+import matplotlib.pyplot as plt
+from glob import glob
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import os
+from paths import scripts, data, output
+from plotutils import range_frame
+from utils import (
+    obtain_chembench_repo,
+    ONE_COL_WIDTH_INCH,
+    ONE_COL_GOLDEN_RATIO_HEIGHT_INCH,
+)
+
+plt.style.use(scripts / "lamalab.mplstyle")
+
+
+def make_human_performance_plots():
+    chembench = obtain_chembench_repo()
+    print(chembench)
+    paths = glob(os.path.join(chembench, "reports/humans/**/*.json"), recursive=True)
+    print(paths)
+    dirs = list(set([os.path.dirname(p) for p in paths]))
+    all_results = []
+    for d in dirs:
+        try:
+            results = load_all_reports(d, "../data/")
+            if len(results) < 5:
+                continue
+            all_results.append(results)
+        except Exception as e:
+            print(e)
+            continue
+
+    number_humans = len(all_results)
+
+    with open(output / "number_experts.txt", "w") as f:
+        f.write(str(int(number_humans)))
+
+    long_df = pd.concat(all_results).reset_index(drop=True)
+    long_df["time_s"] = long_df[("time", 0)]
+
+    total_hours = long_df["time_s"].sum() / 3600
+    with open(output / "total_hours.txt", "w") as f:
+        f.write(rf"\SI{{{int(total_hours)}}}{{\hour}}")
+
+    make_timing_plot(long_df)
+
+
+def make_timing_plot(long_df):
+    fig, ax = plt.subplots(
+        1, 1, figsize=(ONE_COL_WIDTH_INCH, ONE_COL_GOLDEN_RATIO_HEIGHT_INCH)
+    )
+    sns.violinplot(data=long_df, x="all_correct", y="time_s", cut=0, ax=ax)
+    sns.stripplot(
+        data=long_df,
+        x="all_correct",
+        y="time_s",
+        color="black",
+        ax=ax,
+        alpha=0.3,
+        size=2,
+    )
+
+    ax.set_yscale("log")
+    ax.set_ylabel("time / s")
+    ax.set_xlabel("all correct")
+
+    range_frame(
+        ax,
+        np.array([-0.5, 1.5]),
+        np.array([long_df["time_s"].min(), long_df["time_s"].max()]),
+    )
+
+    fig.savefig(output / "human_timing.pdf", bbox_inches="tight")
+
+
+if __name__ == "__main__":
+    make_human_performance_plots()
diff --git a/src/scripts/utils.py b/src/scripts/utils.py
index d036451..c3379d6 100644
--- a/src/scripts/utils.py
+++ b/src/scripts/utils.py
@@ -18,10 +18,12 @@ def obtain_chembench_repo():
     if not os.path.exists(os.path.join(data, "chem-bench")):
         os.chdir(data)
         os.system("git clone https://github.com/lamalab-org/chem-bench.git --depth 1")
-        os.system("git checkout -b humanbench")
+        os.chdir(os.path.join(data, "chem-bench"))
+        os.system("git fetch origin humanbench:humanbench")
+        os.system("git checkout humanbench")
     else:
         os.chdir(os.path.join(data, "chem-bench"))
-        os.system("git checkout -b humanbench")
-        os.system("git pull")
+        os.system("git checkout humanbench")
+        os.system("git pull origin humanbench")
     os.chdir(cwd)
     return os.path.join(data, "chem-bench")
diff --git a/src/tex/ms.tex b/src/tex/ms.tex
index 98fb06b..4806f72 100644
--- a/src/tex/ms.tex
+++ b/src/tex/ms.tex
@@ -58,7 +58,8 @@ \section{Introduction}
 Thus, to better understand what \glspl{llm} can do for chemistry and materials science, and where they might be improved with further developments, a comprehensive analysis is needed.
 For the development of \glspl{llm}, such evaluation is currently mostly performed via standardized benchmarks such as BigBench\cite{srivastava2022beyond} or the LM Eval Harness.\cite{eval-harness}
 The former contains, among 204 tasks, only two tasks classified as \enquote{chemistry related} whereas the latter contains no specific chemistry tasks.
-Due to the lack of widely excepted standard benchmarks, the developers of chemical language models\cite{jablonka2024leveraging, guo2023large, ahmad2022chemberta2, Cai_2024} frequently utilize language-interfaced\cite{dinh2022lift} tabular datasets such as the ones reported in MoleculeNet,\cite{wu2018moleculenet} Therapeutic Data Commons\cite{huang2021therapeutics} or MatBench.\cite{dunn2020benchmarking} While those evaluations can measure how well models can make predictions for very specific tasks, they only give a poor measure of how useful those models might be as a chemical assistant.
+Due to the lack of widely accepted standard benchmarks, the developers of chemical language models\cite{jablonka2024leveraging, guo2023large, ahmad2022chemberta2, Cai_2024} frequently utilize language-interfaced\cite{dinh2022lift} tabular datasets such as the ones reported in MoleculeNet,\cite{wu2018moleculenet} Therapeutic Data Commons\cite{huang2021therapeutics} or MatBench.\cite{dunn2020benchmarking}
+While those evaluations can measure how well models can make predictions for very specific tasks, they only give a poor measure of how useful those models might be as a chemical assistant.
 While some benchmark based on university entrance exams\cite{Zaki_2024, arora2023llms} or automatic text mining\cite{song2023honeybee, wei2021chemistryqa} have been proposed, also those do not satisfy the following basic criteria chemistry benchmarks should satisfy:
 
 \begin{itemize}
@@ -81,7 +82,7 @@ \section{Introduction}
 
 Our benchmark consists of \variable{output/total_number_of_questions.txt}\unskip question answer pairs manually (\variable{output/manually_generated.txt}\unskip) or semi-automatically (\variable{output/automatically_generated.txt}\unskip) compiled from diverse sources.
 It covers a large fraction of the topics taught in undergraduate chemistry curricula at various skill levels and can be used with any system that can return text (i.e., also tool-augmented systems).
-To contextualize the scores, we also surveyed more than XX experts in chemistry on a subset of the benchmark corpus to be able to compare the performance of current frontier models with the one of humans.
+To contextualize the scores, we also surveyed more than \variable{output/number_experts.txt}\unskip experts in chemistry (who in total spent more than \variable{output/total_hours.txt}\unskip on the questions) on a subset of the benchmark corpus to be able to compare the performance of current frontier models with that of humans.
 Our results indicate that current frontier models perform \enquote{superhuman} on some aspects of chemistry but in many cases, included safety-related ones, might be very misleading.
 
 
@@ -189,7 +190,7 @@ \subsection{Model evaluation workflow}
 As initial tests indicated that models sometimes return integers in the form of words, e.g. \enquote{one} instead of \enquote{1}, we also implemented a word-to-number conversion.
 In case these hard-coded parsing steps fail, we fall back to using a \gls{llm}, e.g. Claude-2, to parse the completion.
 The frequency of this fallback being triggered was very different for different models (see XX).
-Manual verification (see XX) indicates that this \gls{llm}-based parsing is not a relevant error source.
+Manual verification (see \Cref{sec:manually-verified-parsing}) indicates that this \gls{llm}-based parsing is not a relevant error source.
 
 \paragraph{Models}
 
@@ -199,8 +200,8 @@ \subsection{Human baseline}
 \paragraph{App}
 To facilitate the collection of responses, we developed a responsive web application in Typescript using the Next.js\cite{nextjs} app router framework.
 This application handles serving the user interface as well as exposes various \gls{rest} \glspl{api} for relevant operations. We utilize a MySQL\cite{mysql} database and Prisma \gls{orm}\cite{prisma} for efficient database management.
-The web application is styled with Tailwind CSS\cite{tailwindcss} using the shadcn/ui component library and uses nextAuth for easy and secure user authentication and postMark for sending Emails.
- The application is hosted on the Vercel web hosting platform.
+The web application is styled with Tailwind CSS\cite{tailwindcss} using the shadcn/ui component library and uses NextAuth\cite{nextauth} for easy and secure user authentication and Postmark for sending emails.
+The application is hosted on the Vercel web hosting platform.
 
 \paragraph{Question selection}
 Since we anticipated that we will not be able to collect enough responses for every question to allow for a meaningful statistical analysis, we decided on showing a relevant subset of all questions to the human scorers.
@@ -260,21 +261,22 @@ \section*{Author contributions}
 \credit{Amir Elahi}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
 \credit{Dinga Wonanke}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
 \credit{Juliane Eberhardt}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
+\credit{Christina Glaubitz}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
 \credit{Mara~Wilhemli}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
 \credit{Macjonathan~Oreke}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
 \credit{Benedict~Emoekabu}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
 \credit{Michael~Pieler}{1,0,0,0,0,0,0,0,0,0,0,0,0,0}
 \credit{Aswanth~Krishnan}{0,0,0,0,0,0,0,0,1,0,0,0,0,0}
 \credit{Philippe~Schwaller}{1,1,0,1,1,0,0,0,0,1,0,0,0,0}
 \credit{Kevin Maik Jablonka}{1,1,1,1,1,1,1,1,1,1,1,1,1,1}
 
-\insertcredits
+% \insertcredits
 
 \bibliography{references}
 
-\appendix
+% \appendix
 
-\subsection{Parsing verification}
+\subsection{Parsing verification} \label{sec:manually-verified-parsing}
 For validating the parsing workflow, we randomly sampled four questions per topic and manually verified that the completions of the model were parsed correctly.
 
 \begin{table}
@@ -287,4 +289,15 @@ \subsection{Parsing verification}
     \end{tabularx}
 \end{table}
+
+\subsection{Human baseline}
+
+\begin{figure}
+    \centering
+    \includegraphics{figures/human_timing.pdf}
+    \script{analyze_human_data.py}
+    \caption{Time human experts spent per question, grouped by whether all parts of the question were answered correctly.}
+    \label{fig:human_timing}
+\end{figure}
+
 
 \end{document}
diff --git a/src/tex/references.bib b/src/tex/references.bib
index 53eea8b..7203413 100644
--- a/src/tex/references.bib
+++ b/src/tex/references.bib
@@ -300,4 +300,12 @@ @misc{tailwindcss
     url = {https://tailwindcss.com/},
     journal = {Tailwind CSS - Rapidly build modern websites without ever leaving your HTML.},
     author = {tailwindcss}
+}
+
+
+@misc{nextauth,
+    title = {NextAuth.js},
+    url = {https://next-auth.js.org/},
+    journal = {NextAuth.js - Authentication for Next.js},
+    author = {NextAuth.js}
 }
\ No newline at end of file