
Commit

updates
kjappelbaum committed Mar 6, 2024
1 parent 8a0456e commit 92f439c
Showing 8 changed files with 127 additions and 13 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -13,3 +13,5 @@ src/tex/figures/*
# Miscellaneous
__pycache__
*.synctex.gz

*/.DS_Store
12 changes: 11 additions & 1 deletion Snakefile
@@ -14,4 +14,14 @@ rule question_plots:
    output:
        "src/tex/figures/question_count_barplot.pdf"
    script:
        "src/scripts/plot_statistics.py"


rule human_statistics:
    input:
        "src/scripts/analyze_human_data.py"
    output:
        ["src/tex/output/number_experts.txt", "src/tex/output/total_hours.txt", "src/tex/output/human_timing.pdf"]
        # "src/tex/output/human_questions.csv", "src/tex/output/human_questions.pkl"
    script:
        "src/scripts/analyze_human_data.py"
1 change: 1 addition & 0 deletions environment.yml
@@ -4,4 +4,5 @@ dependencies:
  - pip:
      - matplotlib
      - numpy==1.26.3
      - seaborn
      - git+https://github.com/lamalab-org/chem-bench.git@humanbench
Binary file modified src/.DS_Store
Binary file not shown.
80 changes: 80 additions & 0 deletions src/scripts/analyze_human_data.py
@@ -0,0 +1,80 @@
from chembench.analysis import load_all_reports
import matplotlib.pyplot as plt
from glob import glob
import pandas as pd
import numpy as np
import seaborn as sns
import os
from paths import scripts, data, output
from plotutils import range_frame
from utils import (
obtain_chembench_repo,
ONE_COL_WIDTH_INCH,
ONE_COL_GOLDEN_RATIO_HEIGHT_INCH,
)

plt.style.use(scripts / "lamalab.mplstyle")


def make_human_performance_plots():
    chembench = obtain_chembench_repo()
    print(chembench)
    paths = glob(os.path.join(chembench, "reports/humans/**/*.json"), recursive=True)
    print(paths)
    dirs = list(set([os.path.dirname(p) for p in paths]))
    all_results = []
    for d in dirs:
        try:
            results = load_all_reports(d, "../data/")
            # skip scorers who answered fewer than five questions
            if len(results) < 5:
                continue
            all_results.append(results)
        except Exception as e:
            print(e)
            continue

    number_humans = len(all_results)

    # write the plain expert count (no unit); total_hours.txt below carries the hours
    with open(output / "number_experts.txt", "w") as f:
        f.write(str(int(number_humans)))

    long_df = pd.concat(all_results).reset_index(drop=True)
    # the per-question time in seconds sits in the multi-index column ("time", 0)
    long_df["time_s"] = long_df[("time", 0)]

    total_hours = long_df["time_s"].sum() / 3600
    with open(output / "total_hours.txt", "w") as f:
        f.write(str(int(total_hours)))

    make_timing_plot(long_df)


def make_timing_plot(long_df):
    fig, ax = plt.subplots(
        1, 1, figsize=(ONE_COL_WIDTH_INCH, ONE_COL_GOLDEN_RATIO_HEIGHT_INCH)
    )
    sns.violinplot(data=long_df, x="all_correct", y="time_s", cut=0, ax=ax)
    sns.stripplot(
        data=long_df,
        x="all_correct",
        y="time_s",
        color="black",
        ax=ax,
        alpha=0.3,
        size=2,
    )

    ax.set_yscale("log")
    ax.set_ylabel("time / s")
    ax.set_xlabel("all correct")

    range_frame(
        ax,
        np.array([-0.5, 1.5]),
        np.array([long_df["time_s"].min(), long_df["time_s"].max()]),
    )

    fig.savefig(output / "human_timing.pdf", bbox_inches="tight")


if __name__ == "__main__":
    make_human_performance_plots()
7 changes: 4 additions & 3 deletions src/scripts/utils.py
@@ -18,10 +18,11 @@ def obtain_chembench_repo():
    if not os.path.exists(os.path.join(data, "chem-bench")):
        os.chdir(data)
        os.system("git clone https://github.com/lamalab-org/chem-bench.git --depth 1")
        # the clone leaves us in `data`; switch into the repository before checking out
        os.chdir(os.path.join(data, "chem-bench"))
        os.system("git fetch origin humanbench")
        os.system("git checkout humanbench")
    else:
        os.chdir(os.path.join(data, "chem-bench"))
        os.system("git checkout humanbench")
        os.system("git pull origin humanbench")
    os.chdir(cwd)
    return os.path.join(data, "chem-bench")
30 changes: 21 additions & 9 deletions src/tex/ms.tex
@@ -58,7 +58,8 @@ \section{Introduction}
Thus, to better understand what \glspl{llm} can do for chemistry and materials science, and where they might be improved with further developments, a comprehensive analysis is needed.
For the development of \glspl{llm}, such evaluation is currently mostly performed via standardized benchmarks such as BigBench\cite{srivastava2022beyond} or the LM Eval Harness.\cite{eval-harness}
The former contains, among 204 tasks, only two tasks classified as \enquote{chemistry related}, whereas the latter contains no specific chemistry tasks.
Due to the lack of widely accepted standard benchmarks, the developers of chemical language models\cite{jablonka2024leveraging, guo2023large, ahmad2022chemberta2, Cai_2024} frequently utilize language-interfaced\cite{dinh2022lift} tabular datasets such as the ones reported in MoleculeNet,\cite{wu2018moleculenet} Therapeutic Data Commons\cite{huang2021therapeutics} or MatBench.\cite{dunn2020benchmarking} While those evaluations can measure how well models can make predictions for very specific tasks, they only give a poor measure of how useful those models might be as a chemical assistant.
Due to the lack of widely accepted standard benchmarks, the developers of chemical language models\cite{jablonka2024leveraging, guo2023large, ahmad2022chemberta2, Cai_2024} frequently utilize language-interfaced\cite{dinh2022lift} tabular datasets such as the ones reported in MoleculeNet,\cite{wu2018moleculenet} Therapeutic Data Commons\cite{huang2021therapeutics} or MatBench.\cite{dunn2020benchmarking}
While those evaluations can measure how well models can make predictions for very specific tasks, they only give a poor measure of how useful those models might be as a chemical assistant.
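
For illustration, language-interfacing turns each tabular record into a text prompt for the model; a minimal sketch (the column name, property, and phrasing are hypothetical, not taken from the cited works):

def row_to_prompt(row: dict) -> str:
    # turn one tabular record into a natural-language question for an LLM
    return (
        f"What is the aqueous solubility of the molecule with SMILES {row['smiles']}? "
        "Answer with a single number in log(mol/L)."
    )


print(row_to_prompt({"smiles": "CCO"}))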

While some benchmarks based on university entrance exams\cite{Zaki_2024, arora2023llms} or automatic text mining\cite{song2023honeybee, wei2021chemistryqa} have been proposed, those also do not satisfy the following basic criteria that chemistry benchmarks should fulfill:
\begin{itemize}
@@ -81,7 +82,7 @@ \section{Introduction}
Our benchmark consists of \variable{output/total_number_of_questions.txt}\unskip question-answer pairs compiled manually (\variable{output/manually_generated.txt}\unskip) or semi-automatically (\variable{output/automatically_generated.txt}\unskip) from diverse sources.
It covers a large fraction of the topics taught in undergraduate chemistry curricula at various skill levels and can be used with any system that can return text (i.e., also tool-augmented systems).

To contextualize the scores, we also surveyed more than XX experts in chemistry on a subset of the benchmark corpus to be able to compare the performance of current frontier models with the one of humans.
To contextualize the scores, we also surveyed more than \variable{output/number_experts.txt}\unskip experts in chemistry (who together spent more than \variable{output/total_hours.txt}\unskip hours on the questions) on a subset of the benchmark corpus to be able to compare the performance of current frontier models with that of humans.
Our results indicate that current frontier models perform \enquote{superhuman} on some aspects of chemistry but might, in many cases, including safety-related ones, be very misleading.


@@ -189,7 +190,7 @@ \subsection{Model evaluation workflow}
As initial tests indicated that models sometimes return integers in the form of words, e.g. \enquote{one} instead of \enquote{1}, we also implemented a word-to-number conversion.
In case these hard-coded parsing steps fail, we fall back to using a \gls{llm}, e.g. Claude-2, to parse the completion.
The frequency with which this fallback was triggered differed strongly between models (see XX).
Manual verification (see XX) indicates that this \gls{llm}-based parsing is not a relevant error source.
Manual verification (see \Cref{sec:manually-verified-parsing}) indicates that this \gls{llm}-based parsing is not a relevant error source.
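
A minimal sketch of such a parsing cascade (illustrative only, not the actual ChemBench implementation; parse_with_llm stands in for the \gls{llm}-based fallback):

import re

# small lookup table for the word-to-number conversion described above
WORD_TO_NUMBER = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5}


def parse_completion(completion, parse_with_llm=None):
    # 1) hard-coded parsing: extract a bare numeric answer
    match = re.search(r"-?\d+(?:\.\d+)?", completion)
    if match:
        return float(match.group())
    # 2) word-to-number conversion, e.g. "one" -> 1
    for word, number in WORD_TO_NUMBER.items():
        if re.search(rf"\b{word}\b", completion.lower()):
            return float(number)
    # 3) fall back to an LLM-based parser (e.g., Claude-2)
    if parse_with_llm is not None:
        return parse_with_llm(completion)
    raise ValueError(f"could not parse completion: {completion!r}")


parse_completion("The answer is one")  # -> 1.0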

\paragraph{Models}

@@ -199,8 +200,8 @@ \subsection{Human baseline}
\paragraph{App} To facilitate the collection of responses, we developed a responsive web application in TypeScript using the Next.js\cite{nextjs} app router framework.
This application serves the user interface and exposes various \gls{rest} \glspl{api} for the relevant operations.
We utilize a MySQL\cite{mysql} database and Prisma \gls{orm}\cite{prisma} for efficient database management.
The web application is styled with Tailwind CSS\cite{tailwindcss} using the shadcn/ui component library and uses nextAuth for easy and secure user authentication and postMark for sending emails.
The web application is styled with Tailwind CSS\cite{tailwindcss} using the shadcn/ui component library and uses NextAuth\cite{nextauth} for easy and secure user authentication and Postmark for sending emails.
The application is hosted on the Vercel web hosting platform.

\paragraph{Question selection}
Since we anticipated that we would not be able to collect enough responses for every question to allow for a meaningful statistical analysis, we decided to show only a relevant subset of all questions to the human scorers (a minimal sketch of one way to draw such a subset follows below).
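
One way such a subset could be drawn (an illustrative sketch, not the actual selection procedure; it assumes the questions are tabulated with a topic column):

import pandas as pd


def sample_question_subset(questions: pd.DataFrame, per_topic: int = 5, seed: int = 0) -> pd.DataFrame:
    # draw up to `per_topic` questions from every topic
    return questions.groupby("topic", group_keys=False).apply(
        lambda group: group.sample(min(per_topic, len(group)), random_state=seed)
    )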
@@ -260,21 +261,22 @@ \section*{Author contributions}
\credit{Amir Elahi}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Dinga Wonanke}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Juliane Eberhardt}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Christina Glaubitz}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Mara~Wilhemli}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
\credit{Macjonathan~Oreke}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
\credit{Benedict~Emoekabu}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
\credit{Michael~Pieler}{1,0,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Aswanth~Krishnan}{0,0,0,0,0,0,0,0,1,0,0,0,0,0}
\credit{Philippe~Schwaller}{1,1,0,1,1,0,0,0,0,1,0,0,0,0}
\credit{Kevin Maik Jablonka}{1,1,1,1,1,1,1,1,1,1,1,1,1,1}

\insertcredits
% \insertcredits

\bibliography{references}

\appendix
% \appendix

\subsection{Parsing verification}
\subsection{Parsing verification} \label{sec:manually-verified-parsing}
To validate the parsing workflow, we randomly sampled four questions per topic and manually verified that the models' completions were parsed correctly.

\begin{table}
@@ -287,4 +289,14 @@ \subsection{Parsing verification}
\end{tabularx}
\end{table}


\subsection{Human baseline}

\begin{figure}
\centering
\includegraphics{figures/human_timing.pdf}
\script{analyze_human_data.py}
\caption{Distribution of the time human experts spent per question, grouped by whether all parts of a question were answered correctly.}
\label{fig:human_timing}
\end{figure}

\end{document}
8 changes: 8 additions & 0 deletions src/tex/references.bib
@@ -300,4 +300,12 @@ @misc{tailwindcss
url = {https://tailwindcss.com/},
journal = {Tailwind CSS - Rapidly build modern websites without ever leaving your HTML.},
author = {tailwindcss}
}
@misc{nextauth,
title = {NextAuth.js},
url = {https://next-auth.js.org/},
journal = {NextAuth.js - Authentication for Next.js},
author = {NextAuth.js}
}
