small fixes (typo, abbreviations) + fix incomplete citations #25

Merged (11 commits) on Aug 19, 2024
src/scripts/analyze_performance_per_source.py (1 addition, 1 deletion)

@@ -39,7 +39,7 @@ def obtain_score_for_subset(df, subset):
"number of isomers",
"number of NMR signals",
"GFK (chemical safety)",
"DAI",
"DAI (daily allowed intake)",
"GHS pictograms",
"name to SMILES",
"SMILES to name",
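The rename above expands the bare abbreviation into a self-explanatory label. Below is a minimal, hypothetical sketch of how a topic list like this typically feeds a per-subset score lookup; the DataFrame layout, column names, and function body are assumptions for illustration, not the repository's actual implementation.

```python
import pandas as pd

# Hypothetical results frame; the column names are assumptions.
results = pd.DataFrame({
    "subset": ["DAI (daily allowed intake)", "GHS pictograms", "number of isomers"],
    "all_correct_": [0.40, 0.70, 0.55],
})

def obtain_score_for_subset(df: pd.DataFrame, subset: str) -> float:
    """Return the mean fraction of fully correct answers for one topic subset."""
    return df.loc[df["subset"] == subset, "all_correct_"].mean()

# A display label used for plotting must match the stored subset name exactly,
# so renaming "DAI" to "DAI (daily allowed intake)" has to happen consistently.
print(obtain_score_for_subset(results, "DAI (daily allowed intake)"))  # 0.4
```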
src/scripts/joint_analysis_confidence_performance.py (4 additions, 3 deletions)

@@ -124,25 +124,26 @@ def make_plot_of_confidence_vs_performance(merged_dicts, suffix: str = ""):
average_performance = df.groupby("estimate")["all_correct_"].mean()
stdev = df.groupby("estimate")["all_correct_"].apply(sem)
ax[i].plot(
- average_performance.index - 1,
+ average_performance.index,
average_performance,
color=model_color_map[model],
marker="o",
)
ax[i].errorbar(
- average_performance.index - 1,
+ average_performance.index,
average_performance,
yerr=stdev,
fmt="none",
color=model_color_map[model],
)

- range_frame(ax[i], np.array([0, 4]), np.array([0, 1]))
+ range_frame(ax[i], np.array([1, 5]), np.array([0, 1]))  # Adjusted x-axis range

# set shared y axis label
fig.text(0.01, 0.5, "completely correct", va="center", rotation="vertical")

ax[-1].set_xlabel("confidence estimate")
+ ax[-1].set_xticks([1, 2, 3, 4, 5])  # Set x-axis ticks to ordinal scale

fig.tight_layout()
fig.savefig(
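The three changes in this hunk fix an off-by-one in the x placement: the group means are keyed by the ordinal confidence values 1 to 5, so plotting them at `index - 1` drew both the curve and the error bars one unit left of their tick labels; the `range_frame` bounds and the explicit ticks now match that 1-5 scale. A self-contained sketch of the corrected behavior follows; the synthetic data, the output filename, and the plain matplotlib axes (standing in for the repository's `range_frame` helper and `model_color_map`) are assumptions for illustration.

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import sem

# Synthetic stand-in data: ordinal confidence estimates (1-5) and a binary
# "completely correct" flag per answer.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "estimate": rng.integers(1, 6, size=200),
    "all_correct_": rng.integers(0, 2, size=200),
})

average_performance = df.groupby("estimate")["all_correct_"].mean()
stdev = df.groupby("estimate")["all_correct_"].apply(sem)

fig, ax = plt.subplots()
# Plot at the ordinal values themselves; the old `index - 1` shifted the
# curve one unit left of the tick labels.
ax.plot(average_performance.index, average_performance, marker="o")
ax.errorbar(average_performance.index, average_performance, yerr=stdev, fmt="none")
ax.set_xticks([1, 2, 3, 4, 5])  # ticks on the ordinal 1-5 confidence scale
ax.set_xlim(1, 5)               # stands in for range_frame's x bounds [1, 5]
ax.set_xlabel("confidence estimate")
ax.set_ylabel("completely correct")
fig.tight_layout()
fig.savefig("confidence_vs_performance_sketch.pdf")
```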
src/tex/appendix.tex (1 addition, 1 deletion)

@@ -208,7 +208,7 @@ \subsection{Performance as a function of molecular features}

\subsection{Influence of model scale}
To obtain first insights into how the performance of \glspl{llm} depends on scale, we tested the \glspl{llm} of the LLaMA series.
- Interestingly, we find that the 7B and 70B models perform comparably, with the 7B showing lower performance (fraction of correct answers for the 7B, 13B, and 70B models are \variable{output/llama/llama_7b.txt}, \variable{output/llama/llama_13b.txt}, \variable{output/llama/llama_70b.txt}).
+ Interestingly, we find that the 7B and 70B models perform comparably, with the 13B showing lower performance (fraction of correct answers for the 7B, 13B, and 70B models are \variable{output/llama/llama_7b.txt}, \variable{output/llama/llama_13b.txt}, \variable{output/llama/llama_70b.txt}).

Note that such analyses are difficult as models are typically not directly comparable in terms of dataset and training protocol.\autocite{biderman2023pythia}

src/tex/ms.tex (2 additions, 2 deletions)

@@ -106,7 +106,7 @@ \subsection{Benchmark corpus}
\centering
\includegraphics{figures/question_count_barplot.pdf}
\caption{\textbf{Number of questions for different topics.} The topics have been assigned using a combination of a rule-based system (mostly based on the source the question has been sampled from) and a classifier operating on word embeddings of the questions.
- The figure shows that not all aspects of chemistry are equally represented in our corpus. The \chembench corpus, by design, currently focuses on safety-related aspects, which is also evident in \Cref{fig:question_diversity}. This figure represents the combined count of \glstext{mcq} and open-ended questions.}
+ The figure shows that not all aspects of chemistry are equally represented in our corpus. The \chembench corpus, by design, currently focuses on safety-related aspects, which is also evident in \Cref{fig:question_diversity}. This figure represents the combined count of multiple-choice questions (\gls{mcq}) and open-ended questions.}
\label{fig:topic_barplot}
\script{plot_statistics.py}
\end{figure}
@@ -323,7 +323,7 @@ \subsection{Model evaluation workflow}
We used Galactica (120b)\autocite{taylor2022galactica} with the default settings.


- \subparagraph{Instruction-tuned models} In addition, we used Claude 2, Claude3 (Opus),\autocite{anthropicClaudeModelFamily2024} GPT-4,\autocite{openai2024gpt4} GPT-3.5-turbo,\autocite{brown2020language} Gemini Pro,\autocite{gemini} Mixtral-8x7b\autocite{jiang2024mixtral} LLaMA2 (7B, 13B, 70B),\autocite{touvron2023llama} as well as the 7B chat model from Perplexity.AI.
+ \subparagraph{Instruction-tuned models} In addition, we used Claude 2, Claude3 (Opus),\autocite{anthropicClaudeModelFamily2024} GPT-4,\autocite{openai2024gpt4} GPT-3.5-turbo,\autocite{brown2020language} Gemini Pro,\autocite{gemini} Mixtral-8x7b,\autocite{jiang2024mixtral} LLaMA2 (7B, 13B, 70B),\autocite{touvron2023llama} as well as the 7B chat model from Perplexity.AI.

\subparagraph{Tool augmented models}
In addition to directly prompting \glspl{llm}, we also investigated the performance of tool-augmented systems.