Skip to content

Commit

Permalink
feat(tokenize): write SIG-NEIGH-NBIT datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
tduigou committed Sep 29, 2023
1 parent daf0454 commit f84dc93
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion src/paper/dataset/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,15 @@ def tokenize(src_file: str, model_prefix: str, vocab_size: int = -1):
index=False,
header=False,
)
# SMILES - SIG-NEIGH-NBIT
# Write the "SMILES" and "SIG-NEIGH-NBIT" columns (in that order) as a
# tab-separated file with neither index nor header row. The file is placed in
# PAIRS_DIR under the output directory; its name lists the second column
# first, followed by `type_` (defined outside this excerpt).
df_pretokenized[["SMILES", "SIG-NEIGH-NBIT"]].to_csv(
os.path.join(
args.output_directory_str, PAIRS_DIR, f"SIG-NEIGH-NBIT.SMILES.{type_}"
),
sep="\t",
index=False,
header=False,
)
# SIG - ECFP4
df_pretokenized[["SIG", "ECFP4"]].to_csv(
os.path.join(
Expand All @@ -267,7 +276,15 @@ def tokenize(src_file: str, model_prefix: str, vocab_size: int = -1):
index=False,
header=False,
)

# SIG-NEIGH-NBIT - ECFP4
# Write the "SIG-NEIGH-NBIT" and "ECFP4" columns (in that order) as a
# tab-separated file with neither index nor header row. The file is placed in
# PAIRS_DIR under the output directory; its name lists the second column
# first, followed by `type_` (defined outside this excerpt).
df_pretokenized[["SIG-NEIGH-NBIT", "ECFP4"]].to_csv(
os.path.join(
args.output_directory_str, PAIRS_DIR, f"ECFP4.SIG-NEIGH-NBIT.{type_}"
),
sep="\t",
index=False,
header=False,
)
# SMILES - ECFP4
df_pretokenized[["SMILES", "ECFP4"]].to_csv(
os.path.join(
Expand Down

0 comments on commit f84dc93

Please sign in to comment.