Skip to content

Commit

Permalink
update paper
Browse files Browse the repository at this point in the history
  • Loading branch information
kjappelbaum committed Mar 8, 2024
1 parent a76aa81 commit 9e3e1d5
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 12 deletions.
2 changes: 1 addition & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ rule question_statistics:
"src/scripts/compute_basic_statistics.py"
output:
# for caching perhaps generate directory https://github.com/showyourwork/showyourwork/issues/119
["src/tex/output/total_number_of_questions.txt", "src/tex/output/automatically_generated.txt", "src/tex/output/manually_generated.txt", "src/tex/output/questions.csv", "src/tex/output/questions.pkl"]
["src/tex/output/total_number_of_questions.txt", "src/tex/output/automatically_generated.txt", "src/tex/output/manually_generated.txt", "src/tex/output/questions.csv", "src/tex/output/questions.pkl", "src/tex/output/num_dai.txt","src/tex/output/num_h_statements.txt", "src/tex/output/num_pictograms.txt"]
script:
"src/scripts/compute_basic_statistics.py"

Expand Down
18 changes: 18 additions & 0 deletions src/scripts/compute_basic_statistics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from utils import obtain_chembench_repo
from chembench.analysis import classify_questions
from paths import output
from glob import glob


def load_data():
Expand Down Expand Up @@ -29,6 +30,23 @@ def load_data():
df.to_csv(output / "questions.csv")
df.to_pickle(output / "questions.pkl")

dai_data = glob(chembench_repo / "data" / "safety" / "pubchem_data" / "DAI*.json")
h_statements = glob(
chembench_repo / "data" / "safety" / "pubchem_data" / "h_state*.json"
)
pictograms = glob(
chembench_repo / "data" / "safety" / "pubchem_data" / "pictogram*.json"
)

with open(output / "num_dai.txt", "w") as f:
f.write(str(len(dai_data)) + "\endinput")

with open(output / "num_h_statements.txt", "w") as f:
f.write(str(len(h_statements)) + "\endinput")

with open(output / "num_pictograms.txt", "w") as f:
f.write(str(len(pictograms)) + "\endinput")


if __name__ == "__main__":
load_data()
2 changes: 1 addition & 1 deletion src/scripts/plot_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

plt.style.use(scripts / "lamalab.mplstyle")
import pandas as pd

from glob import glob

import numpy as np
from plotutils import range_frame
Expand Down
5 changes: 4 additions & 1 deletion src/tex/acronymns.tex
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@
\newacronym{api}{API}{application programming interface}
\newacronym{mcq}{MCQ}{multiple-choice question}
\newacronym{rest}{REST}{Representational State Transfer}
\newacronym{orm}{ORM}{object relational mapping}
\newacronym{orm}{ORM}{object relational mapping}
\newacronym{dai}{DAI}{daily allowed intake}
\newacronym{ghs}{GHS}{Globally Harmonized System of Classification and Labelling of Chemicals}
\newacronym{who}{WHO}{World Health Organization}
8 changes: 6 additions & 2 deletions src/tex/ms.tex
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,13 @@ \subsection{Curation workflow} \label{sec:curation}
\subparagraph{Number of NMR peaks} To generate tasks about the number of NMR peaks, we randomly sampled SMILES from the ZINC dataset and then used OpenChemLib\cite{openchemlib} to compute the number of diastereotopically distinct hydrogen atoms.
We then sampled from the set of number of peaks (excluding the correct answer) to obtain the false answer options.

\subparagraph{GHS classification}
\subparagraph{GHS, hazard statements and DAI}
The \gls{ghs} classification, hazard statements and \gls{dai} data have been extracted from PubChem. \cite{pubchem}
The former two have been mined via the PubChem \gls{api}, while the latter has been manually compiled.
The \glspl{dai} have been curated to contain only records approved by the \gls{who}.
The chemicals in this class of questions belong to one of the three classes: pesticides (e.g., calcium arsenate), insecticides (e.g., cyfluthrin) or herbicides (e.g., 2,4-D).
All data was saved in tabular form, and then we programmatically created questions (\variable{output/num_h_statements.txt} hazard statement definitions, \variable{output/num_pictograms.txt} chemical-\gls{ghs} pictogram matching, \variable{output/num_dai.txt} \glspl{dai}).

\subparagraph{Hazard statements}

\subparagraph{Number of isomers}
We used MAYGEN\cite{Yirik_2021} to compute the number of isomers for a set of randomly sampled SMILES from the ZINC dataset.
Expand Down
7 changes: 0 additions & 7 deletions src/tex/safety_data.tex

This file was deleted.

0 comments on commit 9e3e1d5

Please sign in to comment.