From 42b1a68d8c46b9f0bef5c16185ecdd5ecdcf71bd Mon Sep 17 00:00:00 2001 From: Massimiliano Lupo Pasini Date: Wed, 6 Nov 2024 14:47:42 -0500 Subject: [PATCH] Alexandria add download bash script (#281) * bash script to download dataset * download dataset updated * exclude download of index.html * try-catch exceptions in alexandria example fixed * formatting fixed * natoms count fixed * fix json files --------- Co-authored-by: Massimiliano Lupo Pasini Co-authored-by: Massimiliano Lupo Pasini Co-authored-by: Jong Choi --- examples/alexandria/alexandria_energy.json | 128 ++++++++++++--------- examples/alexandria/alexandria_forces.json | 128 ++++++++++++--------- examples/alexandria/download_dataset.sh | 25 ++++ examples/alexandria/train.py | 43 +++++-- 4 files changed, 205 insertions(+), 119 deletions(-) create mode 100644 examples/alexandria/download_dataset.sh diff --git a/examples/alexandria/alexandria_energy.json b/examples/alexandria/alexandria_energy.json index 1357a5e65..92fb8a165 100644 --- a/examples/alexandria/alexandria_energy.json +++ b/examples/alexandria/alexandria_energy.json @@ -1,58 +1,78 @@ { - "Verbosity": { - "level": 2 - }, - "NeuralNetwork": { - "Architecture": { - "model_type": "EGNN", - "equivariance": true, - "radius": 5.0, - "max_neighbours": 100000, - "num_gaussians": 50, - "envelope_exponent": 5, - "int_emb_size": 64, - "basis_emb_size": 8, - "out_emb_size": 128, - "num_after_skip": 2, - "num_before_skip": 1, - "num_radial": 6, - "num_spherical": 7, - "num_filters": 126, - "edge_features": ["length"], - "hidden_dim": 50, - "num_conv_layers": 3, - "output_heads": { - "graph":{ - "num_sharedlayers": 2, - "dim_sharedlayers": 50, - "num_headlayers": 2, - "dim_headlayers": [50,25] - } - }, - "task_weights": [1.0] - }, - "Variables_of_interest": { - "input_node_features": [0, 1, 2, 3], - "output_names": ["energy"], - "output_index": [0], - "output_dim": [1], - "type": ["graph"] - }, - "Training": { - "num_epoch": 50, - "perc_train": 0.8, - 
"loss_function_type": "mae", - "batch_size": 32, - "continue": 0, - "Optimizer": { - "type": "AdamW", - "learning_rate": 1e-3 - } + "Verbosity": { + "level": 2 + }, + "NeuralNetwork": { + "Architecture": { + "model_type": "EGNN", + "equivariance": true, + "radius": 5, + "max_neighbours": 100000, + "num_gaussians": 50, + "envelope_exponent": 5, + "int_emb_size": 64, + "basis_emb_size": 8, + "out_emb_size": 128, + "num_after_skip": 2, + "num_before_skip": 1, + "num_radial": 6, + "num_spherical": 7, + "num_filters": 126, + "edge_features": [ + "length" + ], + "hidden_dim": 50, + "num_conv_layers": 3, + "output_heads": { + "graph": { + "num_sharedlayers": 2, + "dim_sharedlayers": 50, + "num_headlayers": 2, + "dim_headlayers": [ + 50, + 25 + ] } + }, + "task_weights": [ + 1 + ] + }, + "Variables_of_interest": { + "input_node_features": [ + 0, + 1, + 2, + 3 + ], + "output_names": [ + "energy" + ], + "output_index": [ + 0 + ], + "output_dim": [ + 1 + ], + "type": [ + "graph" + ] }, - "Visualization": { - "plot_init_solution": true, - "plot_hist_solution": false, - "create_plots": true + "Training": { + "num_epoch": 50, + "perc_train": 0.8, + "loss_function_type": "mae", + "batch_size": 32, + "continue": 0, + "Optimizer": { + "type": "AdamW", + "learning_rate": 0.001 + } } -} \ No newline at end of file + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true + } +} diff --git a/examples/alexandria/alexandria_forces.json b/examples/alexandria/alexandria_forces.json index 5a2ac71f8..b22c98992 100644 --- a/examples/alexandria/alexandria_forces.json +++ b/examples/alexandria/alexandria_forces.json @@ -1,58 +1,78 @@ { - "Verbosity": { - "level": 2 - }, - "NeuralNetwork": { - "Architecture": { - "model_type": "EGNN", - "equivariance": true, - "radius": 5.0, - "max_neighbours": 100000, - "num_gaussians": 50, - "envelope_exponent": 5, - "int_emb_size": 64, - "basis_emb_size": 8, - "out_emb_size": 128, - "num_after_skip": 2, - 
"num_before_skip": 1, - "num_radial": 6, - "num_spherical": 7, - "num_filters": 126, - "edge_features": ["length"], - "hidden_dim": 50, - "num_conv_layers": 3, - "output_heads": { - "node": { - "num_headlayers": 2, - "dim_headlayers": [200,200], - "type": "mlp" - } - }, - "task_weights": [1.0] - }, - "Variables_of_interest": { - "input_node_features": [0, 1, 2, 3], - "output_names": ["forces"], - "output_index": [2], - "output_dim": [3], - "type": ["node"] - }, - "Training": { - "num_epoch": 50, - "EarlyStopping": true, - "perc_train": 0.9, - "loss_function_type": "mae", - "batch_size": 32, - "continue": 0, - "Optimizer": { - "type": "AdamW", - "learning_rate": 1e-3 - } + "Verbosity": { + "level": 2 + }, + "NeuralNetwork": { + "Architecture": { + "model_type": "EGNN", + "equivariance": true, + "radius": 5, + "max_neighbours": 100000, + "num_gaussians": 50, + "envelope_exponent": 5, + "int_emb_size": 64, + "basis_emb_size": 8, + "out_emb_size": 128, + "num_after_skip": 2, + "num_before_skip": 1, + "num_radial": 6, + "num_spherical": 7, + "num_filters": 126, + "edge_features": [ + "length" + ], + "hidden_dim": 50, + "num_conv_layers": 3, + "output_heads": { + "node": { + "num_headlayers": 2, + "dim_headlayers": [ + 200, + 200 + ], + "type": "mlp" } + }, + "task_weights": [ + 1 + ] + }, + "Variables_of_interest": { + "input_node_features": [ + 0, + 1, + 2, + 3 + ], + "output_names": [ + "forces" + ], + "output_index": [ + 2 + ], + "output_dim": [ + 3 + ], + "type": [ + "node" + ] }, - "Visualization": { - "plot_init_solution": true, - "plot_hist_solution": false, - "create_plots": true + "Training": { + "num_epoch": 50, + "EarlyStopping": true, + "perc_train": 0.9, + "loss_function_type": "mae", + "batch_size": 32, + "continue": 0, + "Optimizer": { + "type": "AdamW", + "learning_rate": 0.001 + } } -} \ No newline at end of file + }, + "Visualization": { + "plot_init_solution": true, + "plot_hist_solution": false, + "create_plots": true + } +} diff --git 
a/examples/alexandria/download_dataset.sh b/examples/alexandria/download_dataset.sh new file mode 100644 index 000000000..bb47a76d6 --- /dev/null +++ b/examples/alexandria/download_dataset.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# URL to download from +URL="https://alexandria.icams.rub.de/data/" + +# Directory where files will be saved +OUTPUT_DIR="./dataset/compressed_data" + +# Create output directory if it doesn't exist +mkdir -p "$OUTPUT_DIR" + +# Use wget to recursively download all files and directories +wget --recursive \ + --no-parent \ + --continue \ + --no-clobber \ + --convert-links \ + --cut-dirs=1 \ + --no-check-certificate \ + --reject-regex="(/older/|/geo_opt_paths/)" \ + --reject "*index.html*" \ + --directory-prefix="$OUTPUT_DIR" \ + "$URL" + +echo "Download complete. All files saved to $OUTPUT_DIR." diff --git a/examples/alexandria/train.py b/examples/alexandria/train.py index d6492ebf4..b9cb52c90 100644 --- a/examples/alexandria/train.py +++ b/examples/alexandria/train.py @@ -51,6 +51,16 @@ def info(*args, logtype="info", sep=" "): getattr(logging, logtype)(sep.join(map(str, args))) +def list_directories(path): + # List all items in the given path + items = os.listdir(path) + + # Filter out items that are directories + directories = [item for item in items if os.path.isdir(os.path.join(path, item))] + + return directories + + periodic_table = generate_dictionary_elements() # Reversing the dictionary so the elements become keys and the atomic numbers become values @@ -75,11 +85,15 @@ def __init__(self, dirpath, var_config, energy_per_atom=True, dist=False): self.radius_graph = RadiusGraph(5.0, loop=False, max_num_neighbors=50) - indices = ["pascal", "pbe", "pbe_1d", "pbe_2d", "pbesol", "scan"] + list_dirs = list_directories( + os.path.join(dirpath, "compressed_data", "alexandria.icams.rub.de") + ) - for index in indices: + for index in list_dirs: - subdirpath = os.path.join(dirpath, "compressed_data", index) + subdirpath = os.path.join( + dirpath, 
"compressed_data", "alexandria.icams.rub.de", index + ) total_file_list = os.listdir(subdirpath) @@ -95,7 +109,7 @@ def __init__(self, dirpath, var_config, energy_per_atom=True, dist=False): if filepath.endswith("bz2"): self.process_file_content(os.path.join(subdirpath, filepath)) else: - print(f"{filepath} is not a .bz2 file to decompress") + print(f"{filepath} is not a .bz2 file to decompress", flush=True) def get_data_dict(self, computed_entry_dict): """ @@ -124,15 +138,15 @@ def get_magmoms_array_from_structure(structure): assert pos.shape[1] == 3, "pos tensor does not have 3 coordinates per atom" assert pos.shape[0] > 0, "pos tensor does not have any atoms" except: - print(f"Structure {entry_id} does not have positional sites") + print(f"Structure {entry_id} does not have positional sites", flush=True) return data_object - natoms = torch.IntTensor([pos.shape[1]]) + natoms = torch.IntTensor([pos.shape[0]]) cell = None try: cell = torch.tensor(structure["lattice"]["matrix"]).to(torch.float32) except: - print(f"Structure {entry_id} does not have cell") + print(f"Structure {entry_id} does not have cell", flush=True) return data_object atomic_numbers = None @@ -151,14 +165,17 @@ def get_magmoms_array_from_structure(structure): pos.shape[0] == atomic_numbers.shape[0] ), f"pos.shape[0]:{pos.shape[0]} does not match with atomic_numbers.shape[0]:{atomic_numbers.shape[0]}" except: - print(f"Structure {entry_id} does not have positional atomic numbers") + print( + f"Structure {entry_id} does not have positional atomic numbers", + flush=True, + ) return data_object forces_numpy = None try: forces_numpy = get_forces_array_from_structure(structure) except: - print(f"Structure {entry_id} does not have forces") + print(f"Structure {entry_id} does not have forces", flush=True) return data_object forces = torch.tensor(forces_numpy).to(torch.float32) @@ -173,7 +190,7 @@ def get_magmoms_array_from_structure(structure): try: total_energy = 
computed_entry_dict["data"]["energy_total"] except: - print(f"Structure {entry_id} does not have total energy") + print(f"Structure {entry_id} does not have total energy", flush=True) return data_object total_energy_tensor = ( torch.tensor(total_energy).unsqueeze(0).unsqueeze(1).to(torch.float32) @@ -288,8 +305,12 @@ def process_file_content(self, filepath): self.dataset.extend(filtered_computed_entry_dict) except OSError as e: - print("Failed to decompress data:", e) + print("Failed to decompress data:", e, flush=True) decompressed_data = None + except json.JSONDecodeError as e: + print("Failed to decode JSON:", e, flush=True) + except Exception as e: + print("An error occurred:", e, flush=True) def len(self): return len(self.dataset)