From 42b1a68d8c46b9f0bef5c16185ecdd5ecdcf71bd Mon Sep 17 00:00:00 2001
From: Massimiliano Lupo Pasini <massimiliano.lupo.pasini@gmail.com>
Date: Wed, 6 Nov 2024 14:47:42 -0500
Subject: [PATCH] Alexandria add download bash script (#281)

* bash script to download dataset

* download dataset updated

* exclude download of index.html

* try-catch exceptions in alexandria example fixed

* formatting fixed

* natoms count fixed

* fix json files

---------

Co-authored-by: Massimiliano Lupo Pasini <mlupopa@login10.frontier.olcf.ornl.gov>
Co-authored-by: Massimiliano Lupo Pasini <mlupopa@andes-login1.olcf.ornl.gov>
Co-authored-by: Jong Choi <choij@ornl.gov>
---
 examples/alexandria/alexandria_energy.json | 128 ++++++++++++---------
 examples/alexandria/alexandria_forces.json | 128 ++++++++++++---------
 examples/alexandria/download_dataset.sh    |  25 ++++
 examples/alexandria/train.py               |  43 +++++--
 4 files changed, 205 insertions(+), 119 deletions(-)
 create mode 100644 examples/alexandria/download_dataset.sh

diff --git a/examples/alexandria/alexandria_energy.json b/examples/alexandria/alexandria_energy.json
index 1357a5e65..92fb8a165 100644
--- a/examples/alexandria/alexandria_energy.json
+++ b/examples/alexandria/alexandria_energy.json
@@ -1,58 +1,78 @@
 {
-    "Verbosity": {
-        "level": 2
-    },
-    "NeuralNetwork": {
-        "Architecture": {
-            "model_type": "EGNN",
-            "equivariance": true,
-            "radius": 5.0,
-            "max_neighbours": 100000,
-            "num_gaussians": 50,
-            "envelope_exponent": 5,
-            "int_emb_size": 64,
-            "basis_emb_size": 8,
-            "out_emb_size": 128,
-            "num_after_skip": 2,
-            "num_before_skip": 1,
-            "num_radial": 6,
-            "num_spherical": 7,
-            "num_filters": 126,
-            "edge_features": ["length"],
-            "hidden_dim": 50,
-            "num_conv_layers": 3,
-            "output_heads": {
-                "graph":{
-                    "num_sharedlayers": 2,
-                    "dim_sharedlayers": 50,
-                    "num_headlayers": 2,
-                    "dim_headlayers": [50,25]
-                }
-            },
-            "task_weights": [1.0]
-        },
-        "Variables_of_interest": {
-            "input_node_features": [0, 1, 2, 3],
-            "output_names": ["energy"],
-            "output_index": [0],
-            "output_dim": [1],
-            "type": ["graph"]
-        },
-        "Training": {
-            "num_epoch": 50,
-            "perc_train": 0.8,
-            "loss_function_type": "mae",
-            "batch_size": 32,
-            "continue": 0,
-            "Optimizer": {
-                "type": "AdamW",
-                "learning_rate": 1e-3
-            }
+  "Verbosity": {
+    "level": 2
+  },
+  "NeuralNetwork": {
+    "Architecture": {
+      "model_type": "EGNN",
+      "equivariance": true,
+      "radius": 5,
+      "max_neighbours": 100000,
+      "num_gaussians": 50,
+      "envelope_exponent": 5,
+      "int_emb_size": 64,
+      "basis_emb_size": 8,
+      "out_emb_size": 128,
+      "num_after_skip": 2,
+      "num_before_skip": 1,
+      "num_radial": 6,
+      "num_spherical": 7,
+      "num_filters": 126,
+      "edge_features": [
+        "length"
+      ],
+      "hidden_dim": 50,
+      "num_conv_layers": 3,
+      "output_heads": {
+        "graph": {
+          "num_sharedlayers": 2,
+          "dim_sharedlayers": 50,
+          "num_headlayers": 2,
+          "dim_headlayers": [
+            50,
+            25
+          ]
         }
+      },
+      "task_weights": [
+        1
+      ]
+    },
+    "Variables_of_interest": {
+      "input_node_features": [
+        0,
+        1,
+        2,
+        3
+      ],
+      "output_names": [
+        "energy"
+      ],
+      "output_index": [
+        0
+      ],
+      "output_dim": [
+        1
+      ],
+      "type": [
+        "graph"
+      ]
     },
-    "Visualization": {
-        "plot_init_solution": true,
-        "plot_hist_solution": false,
-        "create_plots": true
+    "Training": {
+      "num_epoch": 50,
+      "perc_train": 0.8,
+      "loss_function_type": "mae",
+      "batch_size": 32,
+      "continue": 0,
+      "Optimizer": {
+        "type": "AdamW",
+        "learning_rate": 0.001
+      }
     }
-}
\ No newline at end of file
+  },
+  "Visualization": {
+    "plot_init_solution": true,
+    "plot_hist_solution": false,
+    "create_plots": true
+  }
+}
diff --git a/examples/alexandria/alexandria_forces.json b/examples/alexandria/alexandria_forces.json
index 5a2ac71f8..b22c98992 100644
--- a/examples/alexandria/alexandria_forces.json
+++ b/examples/alexandria/alexandria_forces.json
@@ -1,58 +1,78 @@
 {
-    "Verbosity": {
-        "level": 2
-    },
-   "NeuralNetwork": {
-        "Architecture": {
-            "model_type": "EGNN",
-            "equivariance": true,
-            "radius": 5.0,
-            "max_neighbours": 100000,
-            "num_gaussians": 50,
-            "envelope_exponent": 5,
-            "int_emb_size": 64,
-            "basis_emb_size": 8,
-            "out_emb_size": 128,
-            "num_after_skip": 2,
-            "num_before_skip": 1,
-            "num_radial": 6,
-            "num_spherical": 7,
-            "num_filters": 126,
-            "edge_features": ["length"],
-            "hidden_dim": 50,
-            "num_conv_layers": 3,
-            "output_heads": {
-                "node": {
-                    "num_headlayers": 2,
-                    "dim_headlayers": [200,200],
-                    "type": "mlp"
-                }
-            },
-            "task_weights": [1.0]
-        },
-        "Variables_of_interest": {
-            "input_node_features": [0, 1, 2, 3],
-            "output_names": ["forces"],
-            "output_index": [2],
-            "output_dim": [3],
-            "type": ["node"]
-        },
-        "Training": {
-            "num_epoch": 50,
-            "EarlyStopping": true,
-            "perc_train": 0.9,
-            "loss_function_type": "mae",
-            "batch_size": 32,
-            "continue": 0,
-            "Optimizer": {
-                "type": "AdamW",
-                "learning_rate": 1e-3
-            }
+  "Verbosity": {
+    "level": 2
+  },
+  "NeuralNetwork": {
+    "Architecture": {
+      "model_type": "EGNN",
+      "equivariance": true,
+      "radius": 5,
+      "max_neighbours": 100000,
+      "num_gaussians": 50,
+      "envelope_exponent": 5,
+      "int_emb_size": 64,
+      "basis_emb_size": 8,
+      "out_emb_size": 128,
+      "num_after_skip": 2,
+      "num_before_skip": 1,
+      "num_radial": 6,
+      "num_spherical": 7,
+      "num_filters": 126,
+      "edge_features": [
+        "length"
+      ],
+      "hidden_dim": 50,
+      "num_conv_layers": 3,
+      "output_heads": {
+        "node": {
+          "num_headlayers": 2,
+          "dim_headlayers": [
+            200,
+            200
+          ],
+          "type": "mlp"
         }
+      },
+      "task_weights": [
+        1
+      ]
+    },
+    "Variables_of_interest": {
+      "input_node_features": [
+        0,
+        1,
+        2,
+        3
+      ],
+      "output_names": [
+        "forces"
+      ],
+      "output_index": [
+        2
+      ],
+      "output_dim": [
+        3
+      ],
+      "type": [
+        "node"
+      ]
     },
-    "Visualization": {
-        "plot_init_solution": true,
-        "plot_hist_solution": false,
-        "create_plots": true
+    "Training": {
+      "num_epoch": 50,
+      "EarlyStopping": true,
+      "perc_train": 0.9,
+      "loss_function_type": "mae",
+      "batch_size": 32,
+      "continue": 0,
+      "Optimizer": {
+        "type": "AdamW",
+        "learning_rate": 0.001
+      }
     }
-}
\ No newline at end of file
+  },
+  "Visualization": {
+    "plot_init_solution": true,
+    "plot_hist_solution": false,
+    "create_plots": true
+  }
+}
diff --git a/examples/alexandria/download_dataset.sh b/examples/alexandria/download_dataset.sh
new file mode 100644
index 000000000..bb47a76d6
--- /dev/null
+++ b/examples/alexandria/download_dataset.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Root URL of the Alexandria data server to mirror
+URL="https://alexandria.icams.rub.de/data/"
+
+# Directory where files will be saved (wget creates a host-named subdirectory, alexandria.icams.rub.de, beneath it)
+OUTPUT_DIR="./dataset/compressed_data"
+
+# Create output directory if it doesn't exist
+mkdir -p "$OUTPUT_DIR"
+
+# Recursively download all files. NOTE(review): --no-check-certificate disables TLS verification; confirm it is required.
+wget --recursive \
+     --no-parent \
+     --continue \
+     --no-clobber \
+     --convert-links \
+     --cut-dirs=1 \
+     --no-check-certificate \
+     --reject-regex="(/older/|/geo_opt_paths/)" \
+     --reject "*index.html*" \
+     --directory-prefix="$OUTPUT_DIR" \
+     "$URL" || { status=$?; echo "wget exited with status $status; download may be incomplete." >&2; exit "$status"; }
+
+echo "Download complete. All files saved to $OUTPUT_DIR."
diff --git a/examples/alexandria/train.py b/examples/alexandria/train.py
index d6492ebf4..b9cb52c90 100644
--- a/examples/alexandria/train.py
+++ b/examples/alexandria/train.py
@@ -51,6 +51,16 @@ def info(*args, logtype="info", sep=" "):
     getattr(logging, logtype)(sep.join(map(str, args)))
 
 
+def list_directories(path):
+    """Return the names (not full paths) of the entries in ``path`` that are directories."""
+    items = os.listdir(path)
+
+    # Keep only the items that are directories
+    directories = [item for item in items if os.path.isdir(os.path.join(path, item))]
+
+    return directories
+
+
 periodic_table = generate_dictionary_elements()
 
 # Reversing the dictionary so the elements become keys and the atomic numbers become values
@@ -75,11 +85,15 @@ def __init__(self, dirpath, var_config, energy_per_atom=True, dist=False):
 
         self.radius_graph = RadiusGraph(5.0, loop=False, max_num_neighbors=50)
 
-        indices = ["pascal", "pbe", "pbe_1d", "pbe_2d", "pbesol", "scan"]
+        list_dirs = list_directories(
+            os.path.join(dirpath, "compressed_data", "alexandria.icams.rub.de")
+        )
 
-        for index in indices:
+        for index in list_dirs:
 
-            subdirpath = os.path.join(dirpath, "compressed_data", index)
+            subdirpath = os.path.join(
+                dirpath, "compressed_data", "alexandria.icams.rub.de", index
+            )
 
             total_file_list = os.listdir(subdirpath)
 
@@ -95,7 +109,7 @@ def __init__(self, dirpath, var_config, energy_per_atom=True, dist=False):
                 if filepath.endswith("bz2"):
                     self.process_file_content(os.path.join(subdirpath, filepath))
                 else:
-                    print(f"{filepath} is not a .bz2 file to decompress")
+                    print(f"{filepath} is not a .bz2 file to decompress", flush=True)
 
     def get_data_dict(self, computed_entry_dict):
         """
@@ -124,15 +138,15 @@ def get_magmoms_array_from_structure(structure):
             assert pos.shape[1] == 3, "pos tensor does not have 3 coordinates per atom"
             assert pos.shape[0] > 0, "pos tensor does not have any atoms"
         except:
-            print(f"Structure {entry_id} does not have positional sites")
+            print(f"Structure {entry_id} does not have positional sites", flush=True)
             return data_object
-        natoms = torch.IntTensor([pos.shape[1]])
+        natoms = torch.IntTensor([pos.shape[0]])
 
         cell = None
         try:
             cell = torch.tensor(structure["lattice"]["matrix"]).to(torch.float32)
         except:
-            print(f"Structure {entry_id} does not have cell")
+            print(f"Structure {entry_id} does not have cell", flush=True)
             return data_object
 
         atomic_numbers = None
@@ -151,14 +165,17 @@ def get_magmoms_array_from_structure(structure):
                 pos.shape[0] == atomic_numbers.shape[0]
             ), f"pos.shape[0]:{pos.shape[0]} does not match with atomic_numbers.shape[0]:{atomic_numbers.shape[0]}"
         except:
-            print(f"Structure {entry_id} does not have positional atomic numbers")
+            print(
+                f"Structure {entry_id} does not have positional atomic numbers",
+                flush=True,
+            )
             return data_object
 
         forces_numpy = None
         try:
             forces_numpy = get_forces_array_from_structure(structure)
         except:
-            print(f"Structure {entry_id} does not have forces")
+            print(f"Structure {entry_id} does not have forces", flush=True)
             return data_object
         forces = torch.tensor(forces_numpy).to(torch.float32)
 
@@ -173,7 +190,7 @@ def get_magmoms_array_from_structure(structure):
         try:
             total_energy = computed_entry_dict["data"]["energy_total"]
         except:
-            print(f"Structure {entry_id} does not have total energy")
+            print(f"Structure {entry_id} does not have total energy", flush=True)
             return data_object
         total_energy_tensor = (
             torch.tensor(total_energy).unsqueeze(0).unsqueeze(1).to(torch.float32)
@@ -288,8 +305,12 @@ def process_file_content(self, filepath):
                 self.dataset.extend(filtered_computed_entry_dict)
 
             except OSError as e:
-                print("Failed to decompress data:", e)
+                print("Failed to decompress data:", e, flush=True)
                 decompressed_data = None
+            except json.JSONDecodeError as e:
+                print("Failed to decode JSON:", e, flush=True)
+            except Exception as e:
+                print("An error occurred:", e, flush=True)
 
     def len(self):
         return len(self.dataset)