update paper

lamalab-org · Mar 8, 2024 · 9e3e1d5 · 9e3e1d5
1 parent a76aa81
commit 9e3e1d5
Show file tree

Hide file tree

Showing 6 changed files with 30 additions and 12 deletions.
diff --git a/Snakefile b/Snakefile
@@ -3,7 +3,7 @@ rule question_statistics:
         "src/scripts/compute_basic_statistics.py"
     output: 
         # for caching perhaps generate directory https://github.com/showyourwork/showyourwork/issues/119
-        ["src/tex/output/total_number_of_questions.txt", "src/tex/output/automatically_generated.txt", "src/tex/output/manually_generated.txt", "src/tex/output/questions.csv", "src/tex/output/questions.pkl"]
+        ["src/tex/output/total_number_of_questions.txt", "src/tex/output/automatically_generated.txt", "src/tex/output/manually_generated.txt", "src/tex/output/questions.csv", "src/tex/output/questions.pkl", "src/tex/output/num_dai.txt","src/tex/output/num_h_statements.txt", "src/tex/output/num_pictograms.txt"]
     script: 
         "src/scripts/compute_basic_statistics.py"
 

diff --git a/src/scripts/compute_basic_statistics.py b/src/scripts/compute_basic_statistics.py
@@ -1,6 +1,7 @@
 from utils import obtain_chembench_repo
 from chembench.analysis import classify_questions
 from paths import output
+from glob import glob
 
 
 def load_data():
@@ -29,6 +30,23 @@ def load_data():
     df.to_csv(output / "questions.csv")
     df.to_pickle(output / "questions.pkl")
 
+    dai_data = glob(chembench_repo / "data" / "safety" / "pubchem_data" / "DAI*.json")
+    h_statements = glob(
+        chembench_repo / "data" / "safety" / "pubchem_data" / "h_state*.json"
+    )
+    pictograms = glob(
+        chembench_repo / "data" / "safety" / "pubchem_data" / "pictogram*.json"
+    )
+
+    with open(output / "num_dai.txt", "w") as f:
+        f.write(str(len(dai_data)) + "\endinput")
+
+    with open(output / "num_h_statements.txt", "w") as f:
+        f.write(str(len(h_statements)) + "\endinput")
+
+    with open(output / "num_pictograms.txt", "w") as f:
+        f.write(str(len(pictograms)) + "\endinput")
+
 
 if __name__ == "__main__":
     load_data()
diff --git a/src/scripts/plot_statistics.py b/src/scripts/plot_statistics.py
@@ -4,7 +4,7 @@
 
 plt.style.use(scripts / "lamalab.mplstyle")
 import pandas as pd
-
+from glob import glob
 
 import numpy as np
 from plotutils import range_frame

diff --git a/src/tex/acronymns.tex b/src/tex/acronymns.tex
@@ -4,4 +4,7 @@
 \newacronym{api}{API}{application programming interface}
 \newacronym{mcq}{MCQ}{multiple-choice question}
 \newacronym{rest}{REST}{Representational State Transfer}
-\newacronym{orm}{ORM}{object relational mapping}
+\newacronym{orm}{ORM}{object relational mapping}
+\newacronym{dai}{DAI}{daily allowed intake}
+\newacronym{ghs}{GHS}{Globally Harmonized System of Classification and Labelling of Chemicals}
+\newacronym{who}{WHO}{World Health Organization}
diff --git a/src/tex/ms.tex b/src/tex/ms.tex
@@ -148,9 +148,13 @@ \subsection{Curation workflow} \label{sec:curation}
 \subparagraph{Number of NMR peaks} To generate tasks about the number of NMR peaks, we randomly sampled SMILES from the ZINC dataset and then used OpenChemLib\cite{openchemlib} to compute the number of diastereotopically distinct hydrogen atoms. 
 We then sampled from the set of number of peaks (excluding the correct answer) to obtain the false answer options.
 
-\subparagraph{GHS classification}
+\subparagraph{GHS, hazard statements and DAI}
+The \gls{ghs} classification, hazard statements, and \gls{dai} data have been extracted from PubChem\cite{pubchem}.
+The former two have been mined via the PubChem \gls{api}, while the latter has been manually compiled.
+The \glspl{dai} have been curated to contain only records approved by the \gls{who}.
+The chemicals in this class of questions belong to one of three classes: pesticides (e.g., calcium arsenate), insecticides (e.g., cyfluthrin), or herbicides (e.g., 2,4-D). 
+All data was saved in tabular form, and then we programmatically created questions (\variable{output/num_h_statements.txt} hazard statement definitions, \variable{output/num_pictograms.txt} chemical-\gls{ghs} pictogram matching, \variable{output/num_dai.txt} \glspl{dai}).
 
-\subparagraph{Hazard statements}
 
 \subparagraph{Number of isomers}
 We used MAYGEN\cite{Yirik_2021} to compute the number of isomers for a set of randomly sampled SMILES from the ZINC dataset.

diff --git a/src/tex/safety_data.tex b/src/tex/safety_data.tex