feat(tokenize): write SIG-NEIGH-NBIT datasets

brsynth · Sep 29, 2023 · f84dc93
1 parent daf0454
commit f84dc93
Showing 1 changed file with 18 additions and 1 deletion.
diff --git a/src/paper/dataset/tokenizer.py b/src/paper/dataset/tokenizer.py
@@ -249,6 +249,15 @@ def tokenize(src_file: str, model_prefix: str, vocab_size: int = -1):
             index=False,
             header=False,
         )
+        # SMILES - SIG-NEIGH-NBIT
+        df_pretokenized[["SMILES", "SIG-NEIGH-NBIT"]].to_csv(
+            os.path.join(
+                args.output_directory_str, PAIRS_DIR, f"SIG-NEIGH-NBIT.SMILES.{type_}"
+            ),
+            sep="\t",
+            index=False,
+            header=False,
+        )
         # SIG - ECFP4
         df_pretokenized[["SIG", "ECFP4"]].to_csv(
             os.path.join(
@@ -267,7 +276,15 @@ def tokenize(src_file: str, model_prefix: str, vocab_size: int = -1):
             index=False,
             header=False,
         )
-
+        # SIG-NEIGH-NBIT - ECFP4
+        df_pretokenized[["SIG-NEIGH-NBIT", "ECFP4"]].to_csv(
+            os.path.join(
+                args.output_directory_str, PAIRS_DIR, f"ECFP4.SIG-NEIGH-NBIT.{type_}"
+            ),
+            sep="\t",
+            index=False,
+            header=False,
+        )
         # SMILES - ECFP4
         df_pretokenized[["SMILES", "ECFP4"]].to_csv(
             os.path.join(