diff --git a/.gitignore b/.gitignore
index 566bf84..3ab8c64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ src/tex/figures/*
 # Miscellaneous
 __pycache__
 *.synctex.gz
+
+*/.DS_Store
diff --git a/Snakefile b/Snakefile
index 162e3c5..b404e87 100644
--- a/Snakefile
+++ b/Snakefile
@@ -14,4 +14,14 @@ rule question_plots:
     output:
         "src/tex/figures/question_count_barplot.pdf"
     script:
-        "src/scripts/plot_statistics.py"
\ No newline at end of file
+        "src/scripts/plot_statistics.py"
+
+
+rule human_statistics:
+    input:
+        "src/scripts/analyze_human_data.py"
+    output:
+        ["src/tex/output/number_experts.txt", "src/tex/output/total_hours.txt", "src/tex/output/human_timing.pdf"]
+        # "src/tex/output/human_questions.csv", "src/tex/output/human_questions.pkl"
+    script:
+        "src/scripts/analyze_human_data.py"
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 0f07a8b..0e21419 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,4 +4,5 @@ dependencies:
   - pip:
       - matplotlib
       - numpy==1.26.3
+      - seaborn
       - git+https://github.com/lamalab-org/chem-bench.git@humanbench
\ No newline at end of file
diff --git a/src/.DS_Store b/src/.DS_Store
index 87278d1..5a6c290 100644
Binary files a/src/.DS_Store and b/src/.DS_Store differ
diff --git a/src/scripts/analyze_human_data.py b/src/scripts/analyze_human_data.py
new file mode 100644
index 0000000..4b23697
--- /dev/null
+++ b/src/scripts/analyze_human_data.py
@@ -0,0 +1,80 @@
+from chembench.analysis import load_all_reports
+import matplotlib.pyplot as plt
+from glob import glob
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import os
+from paths import scripts, data, output
+from plotutils import range_frame
+from utils import (
+    obtain_chembench_repo,
+    ONE_COL_WIDTH_INCH,
+    ONE_COL_GOLDEN_RATIO_HEIGHT_INCH,
+)
+
+plt.style.use(scripts / "lamalab.mplstyle")
+
+
+def make_human_performance_plots():
+    chembench = obtain_chembench_repo()
+    print(chembench)
+    paths = glob(os.path.join(chembench, "reports/humans/**/*.json"), recursive=True)
+    print(paths)
+    dirs = list(set([os.path.dirname(p) for p in paths]))
+    all_results = []
+    for d in dirs:
+        try:
+            results = load_all_reports(d, "../data/")
+            if len(results) < 5:
+                continue
+            all_results.append(results)
+        except Exception as e:
+            print(e)
+            continue
+
+    number_humans = len(all_results)
+
+    with open(output / "number_experts.txt", "w") as f:
+        f.write(str(int(number_humans)))
+
+    long_df = pd.concat(all_results).reset_index(drop=True)
+    long_df["time_s"] = long_df[("time", 0)]
+
+    total_hours = long_df["time_s"].sum() / 3600
+    with open(output / "total_hours.txt", "w") as f:
+        f.write(rf"\SI{{{int(total_hours)}}}{{\hour}}")
+
+    make_timing_plot(long_df)
+
+
+def make_timing_plot(long_df):
+    fig, ax = plt.subplots(
+        1, 1, figsize=(ONE_COL_WIDTH_INCH, ONE_COL_GOLDEN_RATIO_HEIGHT_INCH)
+    )
+    sns.violinplot(data=long_df, x="all_correct", y="time_s", cut=0, ax=ax)
+    sns.stripplot(
+        data=long_df,
+        x="all_correct",
+        y="time_s",
+        color="black",
+        ax=ax,
+        alpha=0.3,
+        size=2,
+    )
+
+    ax.set_yscale("log")
+    ax.set_ylabel("time / s")
+    ax.set_xlabel("all correct")
+
+    range_frame(
+        ax,
+        np.array([-0.5, 1.5]),
+        np.array([long_df["time_s"].min(), long_df["time_s"].max()]),
+    )
+
+    fig.savefig(output / "human_timing.pdf", bbox_inches="tight")
+
+
+if __name__ == "__main__":
+    make_human_performance_plots()
diff --git a/src/scripts/utils.py b/src/scripts/utils.py
index d036451..c3379d6 100644
--- a/src/scripts/utils.py
+++ b/src/scripts/utils.py
@@ -18,10 +18,12 @@ def obtain_chembench_repo():
     if not os.path.exists(os.path.join(data, "chem-bench")):
         os.chdir(data)
         os.system("git clone https://github.com/lamalab-org/chem-bench.git --depth 1")
-        os.system("git checkout -b humanbench")
+        os.chdir(os.path.join(data, "chem-bench"))
+        os.system("git fetch origin humanbench:humanbench")
+        os.system("git checkout humanbench")
     else:
         os.chdir(os.path.join(data, "chem-bench"))
-        os.system("git checkout -b humanbench")
-        os.system("git pull")
+        os.system("git checkout humanbench")
+        os.system("git pull origin humanbench")
     os.chdir(cwd)
     return os.path.join(data, "chem-bench")
diff --git a/src/tex/ms.tex b/src/tex/ms.tex
index 98fb06b..4806f72 100644
--- a/src/tex/ms.tex
+++ b/src/tex/ms.tex
@@ -58,7 +58,8 @@ \section{Introduction}
 Thus, to better understand what \glspl{llm} can do for chemistry and materials science, and where they might be improved with further developments, a comprehensive analysis is needed.
 For the development of \glspl{llm}, such evaluation is currently mostly performed via standardized benchmarks such as BigBench\cite{srivastava2022beyond} or the LM Eval Harness.\cite{eval-harness}
 The former contains, among 204 tasks, only two tasks classified as \enquote{chemistry related} whereas the latter contains no specific chemistry tasks.
-Due to the lack of widely excepted standard benchmarks, the developers of chemical language models\cite{jablonka2024leveraging, guo2023large, ahmad2022chemberta2, Cai_2024} frequently utilize language-interfaced\cite{dinh2022lift} tabular datasets such as the ones reported in MoleculeNet,\cite{wu2018moleculenet} Therapeutic Data Commons\cite{huang2021therapeutics} or MatBench.\cite{dunn2020benchmarking} While those evaluations can measure how well models can make predictions for very specific tasks, they only give a poor measure of how useful those models might be as a chemical assistant.
+Due to the lack of widely accepted standard benchmarks, the developers of chemical language models\cite{jablonka2024leveraging, guo2023large, ahmad2022chemberta2, Cai_2024} frequently utilize language-interfaced\cite{dinh2022lift} tabular datasets such as the ones reported in MoleculeNet,\cite{wu2018moleculenet} Therapeutic Data Commons\cite{huang2021therapeutics} or MatBench.\cite{dunn2020benchmarking}
+While those evaluations can measure how well models can make predictions for very specific tasks, they only give a poor measure of how useful those models might be as a chemical assistant.
 While some benchmark based on university entrance exams\cite{Zaki_2024, arora2023llms} or automatic text mining\cite{song2023honeybee, wei2021chemistryqa} have been proposed, also those do not satisfy the following basic criteria chemistry benchmarks should satisfy:
 
 \begin{itemize}
@@ -81,7 +82,7 @@ \section{Introduction}
 
 Our benchmark consists of \variable{output/total_number_of_questions.txt}\unskip question answer pairs manually (\variable{output/manually_generated.txt}\unskip) or semi-automatically (\variable{output/automatically_generated.txt}\unskip) compiled from diverse sources.
 It covers a large fraction of the topics taught in undergraduate chemistry curricula at various skill levels and can be used with any system that can return text (i.e., also tool-augmented systems).
-To contextualize the scores, we also surveyed more than XX experts in chemistry on a subset of the benchmark corpus to be able to compare the performance of current frontier models with the one of humans.
+To contextualize the scores, we also surveyed more than \variable{output/number_experts.txt}\unskip experts in chemistry (who in total spent more than \variable{output/total_hours.txt}\unskip on the questions) on a subset of the benchmark corpus to be able to compare the performance of current frontier models with that of humans.
 Our results indicate that current frontier models perform \enquote{superhuman} on some aspects of chemistry but in many cases, included safety-related ones, might be very misleading.
 
 
@@ -189,7 +190,7 @@ \subsection{Model evaluation workflow}
 As initial tests indicated that models sometimes return integers in the form of words, e.g. \enquote{one} instead of \enquote{1}, we also implemented a word-to-number conversion.
 In case these hard-coded parsing steps fail, we fall back to using a \gls{llm}, e.g. Claude-2, to parse the completion.
 The frequency of this fallback being triggered was very different for different models (see XX).
-Manual verification (see XX) indicates that this \gls{llm}-based parsing is not a relevant error source.
+Manual verification (see \Cref{sec:manually-verified-parsing}) indicates that this \gls{llm}-based parsing is not a relevant error source.
 
 \paragraph{Models}
 
@@ -199,8 +200,8 @@ \subsection{Human baseline}
 \paragraph{App}
 To facilitate the collection of responses, we developed a responsive web application in Typescript using the Next.js\cite{nextjs} app router framework.
 This application handles serving the user interface as well as exposes various \gls{rest} \glspl{api} for relevant operations. We utilize a MySQL\cite{mysql} database and Prisma \gls{orm}\cite{prisma} for efficient database management.
-The web application is styled with Tailwind CSS\cite{tailwindcss} using the shadcn/ui component library and uses nextAuth for easy and secure user authentication and postMark for sending Emails.
- The application is hosted on the Vercel web hosting platform.
+The web application is styled with Tailwind CSS\cite{tailwindcss} using the shadcn/ui component library and uses NextAuth\cite{nextauth} for easy and secure user authentication and Postmark for sending emails.
+The application is hosted on the Vercel web hosting platform.
 
 \paragraph{Question selection}
 Since we anticipated that we will not be able to collect enough responses for every question to allow for a meaningful statistical analysis, we decided on showing a relevant subset of all questions to the human scorers.
@@ -260,21 +261,22 @@ \section*{Author contributions}
 \credit{Amir Elahi}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
 \credit{Dinga Wonanke}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
 \credit{Juliane Eberhardt}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
+\credit{Christina Glaubitz}{0,1,0,0,0,0,0,0,0,0,0,0,0,0}
 \credit{Mara~Wilhemli}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
 \credit{Macjonathan~Oreke}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
 \credit{Benedict~Emoekabu}{0,1,0,0,1,0,0,0,0,0,0,0,0,0}
 \credit{Michael~Pieler}{1,0,0,0,0,0,0,0,0,0,0,0,0,0}
 \credit{Aswanth~Krishnan}{0,0,0,0,0,0,0,0,1,0,0,0,0,0}
 \credit{Philippe~Schwaller}{1,1,0,1,1,0,0,0,0,1,0,0,0,0}
 \credit{Kevin Maik Jablonka}{1,1,1,1,1,1,1,1,1,1,1,1,1,1}
 
-\insertcredits
+% \insertcredits
 
 \bibliography{references}
 
-\appendix
+% \appendix
 
-\subsection{Parsing verification}
+\subsection{Parsing verification} \label{sec:manually-verified-parsing}
 For validating the parsing workflow, we randomly sampled four questions per topic and manually verified that the completions of the model were parsed correctly.
 
 \begin{table}
@@ -287,4 +289,15 @@ \subsection{Parsing verification}
     \end{tabularx}
 \end{table}
+
+\subsection{Human baseline}
+
+\begin{figure}
+    \centering
+    \includegraphics{figures/human_timing.pdf}
+    \script{analyze_human_data.py}
+    \caption{Time human experts spent per question, grouped by whether all parts of the question were answered correctly.}
+    \label{fig:human_timing}
+\end{figure}
+
 
 \end{document}
diff --git a/src/tex/references.bib b/src/tex/references.bib
index 53eea8b..7203413 100644
--- a/src/tex/references.bib
+++ b/src/tex/references.bib
@@ -300,4 +300,12 @@ @misc{tailwindcss
     url = {https://tailwindcss.com/},
     journal = {Tailwind CSS - Rapidly build modern websites without ever leaving your HTML.},
     author = {tailwindcss}
+}
+
+
+@misc{nextauth,
+    title = {NextAuth.js},
+    url = {https://next-auth.js.org/},
+    journal = {NextAuth.js - Authentication for Next.js},
+    author = {NextAuth.js}
 }
\ No newline at end of file