
Commit

updates
kjappelbaum committed Mar 6, 2024
1 parent 8a0456e commit 92f439c
Showing 8 changed files with 127 additions and 13 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -13,3 +13,5 @@ src/tex/figures/*
# Miscellaneous
__pycache__
*.synctex.gz

*/.DS_Store
12 changes: 11 additions & 1 deletion Snakefile
@@ -14,4 +14,14 @@ rule question_plots:
    output:
        "src/tex/figures/question_count_barplot.pdf"
    script:
        "src/scripts/plot_statistics.py"


rule human_statistics:
    input:
        "src/scripts/analyze_human_data.py"
    output:
        ["src/tex/output/number_experts.txt", "src/tex/output/total_hours.txt", "src/tex/output/human_timing.pdf"]
        # "src/tex/output/human_questions.csv", "src/tex/output/human_questions.pkl"
    script:
        "src/scripts/analyze_human_data.py"
1 change: 1 addition & 0 deletions environment.yml
@@ -4,4 +4,5 @@ dependencies:
  - pip:
      - matplotlib
      - numpy==1.26.3
      - seaborn
      - git+https://github.com/lamalab-org/chem-bench.git@humanbench
Binary file modified src/.DS_Store
Binary file not shown.
80 changes: 80 additions & 0 deletions src/scripts/analyze_human_data.py
@@ -0,0 +1,80 @@
from chembench.analysis import load_all_reports
import matplotlib.pyplot as plt
from glob import glob
import pandas as pd
import numpy as np
import seaborn as sns
import os
from paths import scripts, data, output
from plotutils import range_frame
from utils import (
obtain_chembench_repo,
ONE_COL_WIDTH_INCH,
ONE_COL_GOLDEN_RATIO_HEIGHT_INCH,
)

plt.style.use(scripts / "lamalab.mplstyle")


def make_human_performance_plots():
    chembench = obtain_chembench_repo()
    print(chembench)
    paths = glob(os.path.join(chembench, "reports/humans/**/*.json"), recursive=True)
    print(paths)
    dirs = list(set([os.path.dirname(p) for p in paths]))
    all_results = []
    for d in dirs:
        try:
            results = load_all_reports(d, "../data/")
            # skip scorers who answered fewer than five questions
            if len(results) < 5:
                continue
            all_results.append(results)
        except Exception as e:
            print(e)
            continue

    number_humans = len(all_results)

    # write the plain expert count (no unit); total_hours.txt below carries the hours
    with open(output / "number_experts.txt", "w") as f:
        f.write(str(int(number_humans)))

    long_df = pd.concat(all_results).reset_index(drop=True)
    # the per-question time in seconds sits in the multi-index column ("time", 0)
    long_df["time_s"] = long_df[("time", 0)]

    total_hours = long_df["time_s"].sum() / 3600
    with open(output / "total_hours.txt", "w") as f:
        f.write(str(int(total_hours)))

    make_timing_plot(long_df)


def make_timing_plot(long_df):
    fig, ax = plt.subplots(
        1, 1, figsize=(ONE_COL_WIDTH_INCH, ONE_COL_GOLDEN_RATIO_HEIGHT_INCH)
    )
    sns.violinplot(data=long_df, x="all_correct", y="time_s", cut=0, ax=ax)
    sns.stripplot(
        data=long_df,
        x="all_correct",
        y="time_s",
        color="black",
        ax=ax,
        alpha=0.3,
        size=2,
    )

    ax.set_yscale("log")
    ax.set_ylabel("time / s")
    ax.set_xlabel("all correct")

    range_frame(
        ax,
        np.array([-0.5, 1.5]),
        np.array([long_df["time_s"].min(), long_df["time_s"].max()]),
    )

    fig.savefig(output / "human_timing.pdf", bbox_inches="tight")


if __name__ == "__main__":
    make_human_performance_plots()
7 changes: 4 additions & 3 deletions src/scripts/utils.py
@@ -18,10 +18,11 @@ def obtain_chembench_repo():
    if not os.path.exists(os.path.join(data, "chem-bench")):
        os.chdir(data)
        os.system("git clone https://github.com/lamalab-org/chem-bench.git --depth 1")
        # the clone leaves us in `data`; switch into the repository before checking out
        os.chdir(os.path.join(data, "chem-bench"))
        os.system("git fetch origin humanbench")
        os.system("git checkout humanbench")
    else:
        os.chdir(os.path.join(data, "chem-bench"))
        os.system("git checkout humanbench")
        os.system("git pull origin humanbench")
    os.chdir(cwd)
    return os.path.join(data, "chem-bench")
30 changes: 21 additions & 9 deletions src/tex/ms.tex
@@ -58,7 +58,8 @@ \section{Introduction}
Thus, to better understand what \glspl{llm} can do for chemistry and materials science, and where they might be improved with further developments, a comprehensive analysis is needed.
For the development of \glspl{llm}, such evaluation is currently mostly performed via standardized benchmarks such as BigBench\cite{srivastava2022beyond} or the LM Eval Harness.\cite{eval-harness}
The former contains, among 204 tasks, only two tasks classified as \enquote{chemistry related}, whereas the latter contains no specific chemistry tasks.
Due to the lack of widely accepted standard benchmarks, the developers of chemical language models\cite{jablonka2024leveraging, guo2023large, ahmad2022chemberta2, Cai_2024} frequently utilize language-interfaced\cite{dinh2022lift} tabular datasets such as the ones reported in MoleculeNet,\cite{wu2018moleculenet} Therapeutic Data Commons\cite{huang2021therapeutics} or MatBench.\cite{dunn2020benchmarking} While those evaluations can measure how well models can make predictions for very specific tasks, they only give a poor measure of how useful those models might be as a chemical assistant.
Due to the lack of widely accepted standard benchmarks, the developers of chemical language models\cite{jablonka2024leveraging, guo2023large, ahmad2022chemberta2, Cai_2024} frequently utilize language-interfaced\cite{dinh2022lift} tabular datasets such as the ones reported in MoleculeNet,\cite{wu2018moleculenet} Therapeutic Data Commons\cite{huang2021therapeutics} or MatBench.\cite{dunn2020benchmarking}
While those evaluations can measure how well models can make predictions for very specific tasks, they only give a poor measure of how useful those models might be as a chemical assistant.
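
For illustration, language-interfacing turns each tabular record into a text prompt for the model; a minimal sketch (the column name, property, and phrasing are hypothetical, not taken from the cited works):

def row_to_prompt(row: dict) -> str:
    # turn one tabular record into a natural-language question for an LLM
    return (
        f"What is the aqueous solubility of the molecule with SMILES {row['smiles']}? "
        "Answer with a single number in log(mol/L)."
    )


print(row_to_prompt({"smiles": "CCO"}))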

While some benchmarks based on university entrance exams\cite{Zaki_2024, arora2023llms} or automatic text mining\cite{song2023honeybee, wei2021chemistryqa} have been proposed, those also do not satisfy the following basic criteria that chemistry benchmarks should fulfill:
\begin{itemize}
@@ -81,7 +82,7 @@ \section{Introduction}
Our benchmark consists of \variable{output/total_number_of_questions.txt}\unskip question-answer pairs compiled manually (\variable{output/manually_generated.txt}\unskip) or semi-automatically (\variable{output/automatically_generated.txt}\unskip) from diverse sources.
It covers a large fraction of the topics taught in undergraduate chemistry curricula at various skill levels and can be used with any system that can return text (i.e., also tool-augmented systems).

To contextualize the scores, we also surveyed more than XX experts in chemistry on a subset of the benchmark corpus to be able to compare the performance of current frontier models with the one of humans.
To contextualize the scores, we also surveyed more than \variable{output/number_experts.txt}\unskip experts in chemistry (who together spent more than \variable{output/total_hours.txt}\unskip hours on the questions) on a subset of the benchmark corpus to be able to compare the performance of current frontier models with that of humans.
Our results indicate that current frontier models perform \enquote{superhuman} on some aspects of chemistry but might, in many cases, including safety-related ones, be very misleading.


@@ -189,7 +190,7 @@ \subsection{Model evaluation workflow}
As initial tests indicated that models sometimes return integers in the form of words, e.g. \enquote{one} instead of \enquote{1}, we also implemented a word-to-number conversion.
In case these hard-coded parsing steps fail, we fall back to using a \gls{llm}, e.g. Claude-2, to parse the completion.
The frequency with which this fallback was triggered differed strongly between models (see XX).
Manual verification (see XX) indicates that this \gls{llm}-based parsing is not a relevant error source.
Manual verification (see \Cref{sec:manually-verified-parsing}) indicates that this \gls{llm}-based parsing is not a relevant error source.
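
A minimal sketch of such a parsing cascade (illustrative only, not the actual ChemBench implementation; parse_with_llm stands in for the \gls{llm}-based fallback):

import re

# small lookup table for the word-to-number conversion described above
WORD_TO_NUMBER = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5}


def parse_completion(completion, parse_with_llm=None):
    # 1) hard-coded parsing: extract a bare numeric answer
    match = re.search(r"-?\d+(?:\.\d+)?", completion)
    if match:
        return float(match.group())
    # 2) word-to-number conversion, e.g. "one" -> 1
    for word, number in WORD_TO_NUMBER.items():
        if re.search(rf"\b{word}\b", completion.lower()):
            return float(number)
    # 3) fall back to an LLM-based parser (e.g., Claude-2)
    if parse_with_llm is not None:
        return parse_with_llm(completion)
    raise ValueError(f"could not parse completion: {completion!r}")


parse_completion("The answer is one")  # -> 1.0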

\paragraph{Models}

@@ -199,8 +200,8 @@ \subsection{Human baseline}
\paragraph{App} To facilitate the collection of responses, we developed a responsive web application in TypeScript using the Next.js\cite{nextjs} app router framework.
This application serves the user interface and exposes various \gls{rest} \glspl{api} for the relevant operations.
We utilize a MySQL\cite{mysql} database and Prisma \gls{orm}\cite{prisma} for efficient database management.
The web application is styled with Tailwind CSS\cite{tailwindcss} using the shadcn/ui component library and uses nextAuth for easy and secure user authentication and postMark for sending emails.
The web application is styled with Tailwind CSS\cite{tailwindcss} using the shadcn/ui component library and uses NextAuth\cite{nextauth} for easy and secure user authentication and Postmark for sending emails.
The application is hosted on the Vercel web hosting platform.

\paragraph{Question selection}
Since we anticipated that we would not be able to collect enough responses for every question to allow for a meaningful statistical analysis, we decided to show only a relevant subset of all questions to the human scorers (a minimal sketch of one way to draw such a subset follows below).
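
One way such a subset could be drawn (an illustrative sketch, not the actual selection procedure; it assumes the questions are tabulated with a topic column):

import pandas as pd


def sample_question_subset(questions: pd.DataFrame, per_topic: int = 5, seed: int = 0) -> pd.DataFrame:
    # draw up to `per_topic` questions from every topic
    return questions.groupby("topic", group_keys=False).apply(
        lambda group: group.sample(min(per_topic, len(group)), random_state=seed)
    )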
@@ -260,21 +261,22 @@ \section*{Author contributions}
\credit{Amir Elahi}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Dinga Wonanke}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Juliane Eberhardt}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Christina Glaubitz}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Mara~Wilhemli}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
\credit{Macjonathan~Oreke}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
\credit{Benedict~Emoekabu}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
\credit{Michael~Pieler}{1,0,0,0,0,0,0,0,0,0,0,0,0,0}
\credit{Aswanth~Krishnan}{0,0,0,0,0,0,0,0,1,0,0,0,0,0}
\credit{Philippe~Schwaller}{1,1,0,1,1,0,0,0,0,1,0,0,0,0}
\credit{Kevin Maik Jablonka}{1,1,1,1,1,1,1,1,1,1,1,1,1,1}

\insertcredits
% \insertcredits

\bibliography{references}

\appendix
% \appendix

\subsection{Parsing verification}
\subsection{Parsing verification} \label{sec:manually-verified-parsing}
To validate the parsing workflow, we randomly sampled four questions per topic and manually verified that the models' completions were parsed correctly.

\begin{table}
@@ -287,4 +289,14 @@ \subsection{Parsing verification}
\end{tabularx}
\end{table}


\subsection{Human baseline}

\begin{figure}
\centering
\includegraphics{figures/human_timing.pdf}
\script{analyze_human_data.py}
\caption{Distribution of the time human experts spent per question, grouped by whether all parts of a question were answered correctly.}
\label{fig:human_timing}
\end{figure}

\end{document}
8 changes: 8 additions & 0 deletions src/tex/references.bib
@@ -300,4 +300,12 @@ @misc{tailwindcss
url = {https://tailwindcss.com/},
journal = {Tailwind CSS - Rapidly build modern websites without ever leaving your HTML.},
author = {tailwindcss}
}
@misc{nextauth,
title = {NextAuth.js},
url = {https://next-auth.js.org/},
journal = {NextAuth.js - Authentication for Next.js},
author = {NextAuth.js}
}
