diff --git a/flare/gp.py b/flare/gp.py
index c35bf72f4..f322ac5c8 100644
--- a/flare/gp.py
+++ b/flare/gp.py
@@ -929,7 +929,7 @@ def training_statistics(self) -> dict:
                                             env.atom]))

         # Summarize the relevant information
-        data['species'] = set(present_species)
+        data['species'] = list(set(present_species))
         data['envs_by_species'] = dict(Counter(present_species))

         return data
diff --git a/flare/gp_from_aimd.py b/flare/gp_from_aimd.py
index fe108abbc..55cbc1e5f 100644
--- a/flare/gp_from_aimd.py
+++ b/flare/gp_from_aimd.py
@@ -19,8 +19,8 @@
 individual species.

 If you are studying a system where the dynamics of one species are
-particularly important and so you want a good representation in the training set,
-then you would want to include as many as possible in the training set
+particularly important and so you want a good representation in the training
+set, then you would want to include as many as possible in the training set
 during the seed part of the training.

 Inversely, if a system has high representation of a species well-described
@@ -32,25 +32,24 @@
 of atoms which are added from a given seed frame.
 """
+import json as json
 import time
+import warnings
 from copy import deepcopy
+from math import inf
 from typing import List, Tuple, Union

 import numpy as np
-from math import inf
-import warnings
-
 from flare.env import AtomicEnvironment
 from flare.gp import GaussianProcess
+from flare.mgp.mgp_en import MappedGaussianProcess
+from flare.mgp.otf import predict_on_structure_mgp
 from flare.output import Output
-from flare.predict import predict_on_atom, predict_on_atom_en, \
-    predict_on_structure_par, predict_on_structure_par_en
+from flare.predict import predict_on_structure_par, predict_on_structure_par_en
 from flare.struc import Structure
 from flare.util import element_to_Z, \
     is_std_in_bound_per_species, is_force_in_bound_per_species, \
-    Z_to_element, subset_of_frame_by_element
-from flare.mgp.otf import predict_on_structure_mgp
-from flare.mgp.mgp_en import MappedGaussianProcess
+    Z_to_element, subset_of_frame_by_element, NumpyEncoder


 class TrajectoryTrainer:
@@ -107,7 +106,8 @@ def __init__(self, frames: List[Structure],
         :param min_atoms_per_train: Only train when this many atoms have
             been added
         :param max_trains: Stop training GP after this many calls to train
-        :param n_cpus: Number of CPUs to parallelize over for parallelization over atoms
+        :param n_cpus: Number of CPUs to parallelize over for parallelization
+            over atoms
         :param shuffle_frames: Randomize order of frames for better training
         :param verbose: 0: Silent, NO output written or printed at all.
             1: Minimal,
@@ -182,7 +182,7 @@ def __init__(self, frames: List[Structure],
         assert (isinstance(skip, int) and skip >= 1), "Skip needs to be a " \
                                                       "positive integer."
         self.validate_ratio = validate_ratio
-        assert (validate_ratio >= 0 and validate_ratio <= 1), \
+        assert (0 <= validate_ratio <= 1), \
             "validate_ratio needs to be [0,1]"

         # Set up for pretraining
@@ -194,7 +194,7 @@ def __init__(self, frames: List[Structure],
             else pre_train_seed_frames

         self.pre_train_env_per_species = {} if pre_train_atoms_per_element \
-            is None else pre_train_atoms_per_element
+                                         is None else pre_train_atoms_per_element
         self.train_env_per_species = {} if train_atoms_per_element \
             is None else train_atoms_per_element

@@ -234,7 +234,7 @@ def pre_run(self):
         """
         if self.mgp:
-            raise NotImplementedError("Pre-running not" \
+            raise NotImplementedError("Pre-running not"
                                       "yet configured for MGP")
         if self.verbose:
             self.output.write_header(self.gp.cutoffs,
@@ -243,11 +243,16 @@ def pre_run(self):
                                      self.gp.opt_algorithm,
                                      dt=0,
                                      Nsteps=len(self.frames),
-                                     structure=self.frames[0],
+                                     structure=None,
                                      std_tolerance=(self.rel_std_tolerance,
                                                     self.abs_std_tolerance),
                                      optional={
-                                         'GP Statistics': self.gp.training_statistics})
+                                         'GP Statistics':
+                                             json.dumps(
+                                                 self.gp.training_statistics),
+                                         'GP Name': self.gp.name,
+                                         'GP Write Name':
+                                             self.output_name + "_model." + self.model_format})

         self.start_time = time.time()
         if self.verbose >= 3:
@@ -299,9 +304,11 @@ def pre_run(self):
                                          train_atoms=train_atoms,
                                          uncertainties=[],
                                          train=False)
-        if self.verbose >= 3 and atom_count > 0:
-            self.output.write_to_log(f"Added {atom_count} atoms to pretrain\n" \
-                                     f"In total {len(self.gp.training_data)} atoms",
+        if self.verbose and atom_count > 0:
+            self.output.write_to_log(f"Added {atom_count} atoms to "
+                                     f"pretrain.\n"
+                                     f"Pre-run GP Statistics: "
+                                     f"{json.dumps(self.gp.training_statistics)} \n",
                                      flush=True)

         if (self.seed_envs or atom_count or self.seed_frames) and \
@@ -338,13 +345,13 @@ def run(self):

         # Past this frame, stop adding atoms to the training set
         # (used for validation of model)
-        train_frame = int(len(self.frames[::self.skip]) * (1 -
-                                                           self.validate_ratio))
+        train_frame = int(len(self.frames[::self.skip])
+                          * (1 - self.validate_ratio))

         # Loop through trajectory.
-        cur_atoms_added_train = 0 # Track atoms added for training
-        cur_atoms_added_write = 0 # Track atoms added for writing
-        cur_trains_done_write = 0 # Track training done for writing
+        cur_atoms_added_train = 0  # Track atoms added for training
+        cur_atoms_added_write = 0  # Track atoms added for writing
+        cur_trains_done_write = 0  # Track training done for writing

         for i, cur_frame in enumerate(self.frames[::self.skip]):

@@ -354,7 +361,7 @@ def run(self):
             # If no predict_atoms_per_element was specified, predict_atoms
             # will be equal to every atom in the frame.
             predict_atoms = subset_of_frame_by_element(cur_frame,
-                                          self.predict_atoms_per_element)
+                                                       self.predict_atoms_per_element)

             # Atoms which are skipped will have NaN as their force / std values
             local_energies = None

@@ -447,20 +454,20 @@ def run(self):
                     else:
                         self.gp.update_L_alpha()
-
                     # Loop to decide of a model should be written this
                     # iteration
                     will_write = False

                     if self.train_checkpoint_interval and \
-                            cur_trains_done_write and\
+                            cur_trains_done_write and \
                             self.train_checkpoint_interval \
                             % cur_trains_done_write == 0:
                         will_write = True
                         cur_trains_done_write = 0

-                    if self.atom_checkpoint_interval and cur_atoms_added_write\
-                            and self.atom_checkpoint_interval \
+                    if self.atom_checkpoint_interval \
+                            and cur_atoms_added_write \
+                            and self.atom_checkpoint_interval \
                             % cur_atoms_added_write == 0:
                         will_write = True
                         cur_atoms_added_write = 0

@@ -480,7 +487,7 @@ def run(self):
                               self.model_format)

     def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
-                            uncertainties: List[int]=None,
+                            uncertainties: List[int] = None,
                             train: bool = True):
         """
         Update the internal GP model training set with a list of training
@@ -488,13 +495,11 @@ def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
         the GP by optimizing hyperparameters.
         :param frame: Structure to train on
         :param train_atoms: Index atoms to train on
-        :param: uncertainties: Uncertainties to print, pass in [] to silence
+        :param uncertainties: Uncertainties to print, pass in [] to silence
         :param train: Train or not
         :return: None
         """
-
-
         # Group added atoms by species for easier output
         added_species = [Z_to_element(frame.coded_species[at]) for at in
                          train_atoms]
@@ -503,12 +508,12 @@ def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
         for atom, spec in zip(train_atoms, added_species):
             added_atoms[spec].append(atom)

-        if self.verbose:
-            self.output.write_to_log(f'\nAdding atom(s) {added_atoms}'
+        self.output.write_to_log('\nAdding atom(s) '
+                                 f'{json.dumps(added_atoms,cls=NumpyEncoder)}'
                                  ' to the training set.\n')

-        if uncertainties is None or len(uncertainties)!=0:
+        if uncertainties is None or len(uncertainties) != 0:
             uncertainties = frame.stds[train_atoms]

         if self.verbose and len(uncertainties) != 0:
@@ -559,3 +564,139 @@ def train_gp(self, max_iter: int = None):
                               self.gp.likelihood_gradient,
                               hyps_mask=self.gp.hyps_mask)
         self.train_count += 1
+
+
+def parse_trajectory_trainer_output(file: str, return_gp_data: bool = False) \
+        -> Union[List[dict], Tuple[List[dict], dict]]:
+    """
+    Reads output of a TrajectoryTrainer run by frame. return_gp_data returns
+    data about GP model growth useful for visualizing progress of model
+    training.
+
+    :param file: filename of output
+    :param return_gp_data: flag for returning extra GP data
+    :return: List of dictionaries with keys 'species', 'positions',
+        'gp_forces', 'dft_forces', 'gp_stds', 'added_atoms', and
+        'maes_by_species', optionally, gp_data dictionary
+    """
+
+    with open(file, 'r') as f:
+        lines = f.readlines()
+        num_lines = len(lines)
+
+    # Get indexes where frames begin, and include the index of the final line
+    frame_indexes = [i for i in range(num_lines) if '-Frame:' in
+                     lines[i]] + [num_lines]
+
+    frames = []
+
+    # Central parsing loop
+    for n in range(len(frame_indexes) - 1):
+        # Start at +2 to skip frame marker and header of table of data
+        # Set up values for current frame which will be populated
+
+        frame_atoms = []
+        frame_positions = []
+        gp_forces = []
+        dft_forces = []
+        stds = []
+        added_atoms = {}
+        frame_species_maes = {}
+
+        # i loops through individual atom's info
+        for i in range(frame_indexes[n] + 2, frame_indexes[n + 1]):
+
+            # Lines with data will be long; stop when at end of atom data
+            if len(lines[i]) > 10:
+                split = lines[i].split()
+
+                frame_atoms.append(split[0])
+
+                frame_positions.append([float(split[1]), float(split[2]),
+                                        float(split[3])])
+                gp_forces.append([float(split[4]), float(split[5]),
+                                  float(split[6])])
+                stds.append(
+                    [float(split[7]), float(split[8]), float(split[9])])
+
+                dft_forces.append([float(split[10]), float(split[11]),
+                                   float(split[12])])
+
+            # Terminate at blank line between results
+            else:
+                break
+        # Loop through information in frame after Data
+        for i in range(frame_indexes[n] + len(frame_positions) + 2,
+                       frame_indexes[n + 1]):
+
+            if 'Adding atom(s)' in lines[i]:
+                # Splitting to target the 'added atoms' substring
+                split_line = lines[i][15:-21]
+                added_atoms = json.loads(split_line.strip())
+
+            if 'type ' in lines[i]:
+                cur_line = lines[i].split()
+                frame_species_maes[cur_line[1]] = float(cur_line[3])
+
+        cur_frame_stats = {'species': frame_atoms,
+                           'positions': frame_positions,
+                           'gp_forces': gp_forces,
+                           'dft_forces': dft_forces,
+                           'gp_stds': stds,
+                           'added_atoms': added_atoms,
+                           'maes_by_species': frame_species_maes}

+        frames.append(cur_frame_stats)
+
+    if not return_gp_data:
+        return frames
+
+    # Compute information about GP training
+    # to study GP growth and performance over trajectory
+
+    gp_stats_line = [line for line in lines[:30] if 'GP Statistics' in
+                     line and 'Pre-run' not in line][0][15:].strip()
+
+    initial_gp_statistics = json.loads(gp_stats_line)
+
+    # Get pre_run statistics (if pre-run was done):
+    pre_run_gp_statistics = None
+    pre_run_gp_stats_line = [line for line in lines if 'Pre-run GP' in line]
+    if pre_run_gp_stats_line:
+        pre_run_gp_statistics = json.loads(pre_run_gp_stats_line[0][
+                                           22:].strip())
+
+    # Compute cumulative GP size
+    cumulative_gp_size = [int(initial_gp_statistics['N'])]
+
+    if pre_run_gp_stats_line:
+        cumulative_gp_size.append(int(pre_run_gp_statistics['N']))
+
+    running_total = cumulative_gp_size[-1]
+
+    for frame in frames:
+
+        added_atom_dict = frame['added_atoms']
+        for val in added_atom_dict.values():
+            running_total += len(val)
+        cumulative_gp_size.append(running_total)
+
+    # Compute MAEs for each element over time
+    all_species = set()
+    for frame in frames:
+        all_species = all_species.union(set(frame['species']))
+
+    all_species = list(all_species)
+    mae_by_elt = {elt: [] for elt in all_species}
+
+    for frame in frames:
+        for elt in all_species:
+            cur_mae = frame['maes_by_species'].get(elt, np.nan)
+            mae_by_elt[elt].append(cur_mae)
+
+    gp_data = {'init_stats': initial_gp_statistics,
+               'pre_train_stats': pre_run_gp_statistics,
+               'cumulative_gp_size': cumulative_gp_size,
+               'mae_by_elt': mae_by_elt}
+
+    return frames, gp_data
diff --git a/flare/output.py b/flare/output.py
index 0866a2567..29e644fb6 100644
--- a/flare/output.py
+++ b/flare/output.py
@@ -7,11 +7,13 @@
 import os
 import shutil
 import time
-
 import multiprocessing
+
 import numpy as np
+from typing import Union

 from flare.util import Z_to_element
+from flare.struc import Structure


 class Output:
@@ -91,12 +93,13 @@ def write_to_log(self, logstring: str, name: str = "log",
             self.outfiles[name].flush()

     def write_header(self, cutoffs, kernel_name: str,
-                     hyps, algo: str, dt: float,
-                     Nsteps: int, structure,
-                     std_tolerance,
+                     hyps, algo: str, dt: float = None,
+                     Nsteps: int = None, structure: Structure= None,
+                     std_tolerance: Union[float, int] = None,
                      optional: dict = None):
         """
-        Write header to the log function
+        Write header to the log function. Designed for Trajectory Trainer and
+        OTF runs and can take flexible input for both.

         :param cutoffs: GP cutoffs
         :param kernel_name: Kernel names
@@ -104,7 +107,7 @@ def write_header(self, cutoffs, kernel_name: str,
         :param algo: algorithm for hyper parameter optimization
         :param dt: timestep for OTF MD
         :param Nsteps: total number of steps for OTF MD
-        :param structure: the atomic structure
+        :param structure: initial structure
         :param std_tolerance: tolarence for active learning
         :param optional: a dictionary of all the other parameters
         """
@@ -114,7 +117,7 @@ def write_header(self, cutoffs, kernel_name: str,

         if isinstance(std_tolerance, tuple):
             std_string = 'relative uncertainty tolerance: ' \
-                         f'{std_tolerance[0]} eV/A\n'
+                         f'{std_tolerance[0]} times noise hyperparameter \n'
             std_string += 'absolute uncertainty tolerance: ' \
                           f'{std_tolerance[1]} eV/A\n'
         elif std_tolerance < 0:
@@ -122,7 +125,8 @@ ...
                 f'uncertainty tolerance: {np.abs(std_tolerance)} eV/A\n'
         elif std_tolerance > 0:
             std_string = \
-                f'uncertainty tolerance: {np.abs(std_tolerance)} times noise \n'
+                f'uncertainty tolerance: {np.abs(std_tolerance)} ' \
+                'times noise hyperparameter \n'
         else:
             std_string = ''

@@ -135,24 +139,27 @@ ...
         headerstring += f'hyperparameters: {str(hyps)}\n'
         headerstring += f'hyperparameter optimization algorithm: {algo}\n'
         headerstring += std_string
-        headerstring += f'timestep (ps): {dt}\n'
+        if dt is not None:
+            headerstring += f'timestep (ps): {dt}\n'
         headerstring += f'number of frames: {Nsteps}\n'
-        headerstring += f'number of atoms: {structure.nat}\n'
-        headerstring += f'system species: {set(structure.species_labels)}\n'
-        headerstring += 'periodic cell: \n'
-        headerstring += str(structure.cell)+'\n'
+        if structure is not None:
+            headerstring += f'number of atoms: {structure.nat}\n'
+            headerstring += f'system species: {set(structure.species_labels)}\n'
+            headerstring += 'periodic cell: \n'
+            headerstring += str(structure.cell)+'\n'
         if optional:
             for key, value in optional.items():
                 headerstring += f"{key}: {value} \n"

         # report previous positions
-        headerstring += '\nprevious positions (A):\n'
-        for i in range(len(structure.positions)):
-            headerstring += f'{structure.species_labels[i]:5}'
-            for j in range(3):
-                headerstring += f'{structure.prev_positions[i][j]:10.4f}'
-            headerstring += '\n'
+        if structure is not None:
+            headerstring += '\nprevious positions (A):\n'
+            for i in range(len(structure.positions)):
+                headerstring += f'{structure.species_labels[i]:5}'
+                for j in 
range(3): + headerstring += f'{structure.prev_positions[i][j]:10.4f}' + headerstring += '\n' headerstring += '-' * 80 + '\n' f.write(headerstring) diff --git a/tests/test_files/gpfa_parse_test.out b/tests/test_files/gpfa_parse_test.out new file mode 100644 index 000000000..1c7fdbcd4 --- /dev/null +++ b/tests/test_files/gpfa_parse_test.out @@ -0,0 +1,133 @@ +2020-05-18 18:05:17.836559 +number of cpu cores: 4 +cutoffs: [7. 3.] +kernel_name: two_plus_three_body_mc +number of hyperparameters: 5 +hyperparameters: [3.75996759e-06 1.53990678e-02 2.50624782e-05 5.07884426e-01 + 1.70172923e-03] +hyperparameter optimization algorithm: L-BFGS-B +relative uncertainty tolerance: 0 times noise hyperparameter +absolute uncertainty tolerance: 0 eV/A +timestep (ps): 0 +number of frames: 24 +GP Statistics: {"N": 0, "species": [], "envs_by_species": {}} +GP Name: default_gp +GP Write Name: meth_test_model.json +-------------------------------------------------------------------------------- + +Adding atom(s) {"H": [3], "O": [1], "C": [0]} to the training set. +Added 3 atoms to pretrain. +Pre-run GP Statistics: {"N": 9, "species": ["H", "C", "O"], "envs_by_species": {"C": 2, "O": 2, "H": 5}} +Train GP + +GP hyperparameters: +Hyp0 : l2 = 0.0001 +Hyp1 : s2 = 0.0099 +Hyp2 : l3 = 0.0000 +Hyp3 : s3 = 0.5079 +Hyp4 : n0 = 0.0010 +likelihood: 52.3972 +likelihood gradient: [ 1.03585519e+01 -1.17983951e-01 1.95960377e+00 -1.40589820e-06 + -1.33249347e+03] +wall time from start: 7.60 s + +*-Frame: 0 +El Position (A) GP Force (ev/A) Std. Dev (ev/A) DFT Force (ev/A) +C 5.0356 5.3627 11.293 0.0013479 -0.0086617 -0.00014977 0.00083762 0.00090705 0.00083418 0.0010674 -0.0086516 -0.00036775 +O 5.3892 4.055 10.834 nan nan nan nan nan nan -0.0030383 0.011562 0.0011384 +H 6.1001 4.1431 10.176 nan nan nan nan nan nan -0.00041708 0.00012457 0.00026436 +H 4.646 6.0016 10.481 nan nan nan nan nan nan 0.00043879 -0.00056664 -0.0015216 +H 5.8825 5.8788 11.78 nan nan nan nan nan nan 0.0026189 -0.00042729 0.0014758 +H 4.2402 5.2274 12.038 nan nan nan nan nan nan -0.00022832 -0.0015721 -0.00089144 + +mean absolute error: 0.17 meV/A +mean absolute dft component: 2.02 meV/A +mae per species +type C mae: 0.17 meV/A +wall time from start: 0.00048 + +*-Frame: 1 +El Position (A) GP Force (ev/A) Std. Dev (ev/A) DFT Force (ev/A) +C 5.0122 5.3557 11.306 -0.010021 0.037157 0.096695 0.09305 0.17919 0.093806 -0.95492 0.28934 0.48548 +O 5.3379 4.0498 10.847 nan nan nan nan nan nan 0.62334 0.37294 -0.53905 +H 6.1254 4.1741 10.282 nan nan nan nan nan nan -0.11904 -0.39129 -0.035069 +H 4.5732 6.05 10.558 nan nan nan nan nan nan 0.024085 -0.39282 -0.29839 +H 5.8515 5.9125 11.745 nan nan nan nan nan nan 0.21593 -0.20974 0.28283 +H 4.2239 5.1764 12.058 nan nan nan nan nan nan 0.21686 0.32463 0.098931 + +mean absolute error: 528.62 meV/A +mean absolute dft component: 326.37 meV/A +mae per species +type C mae: 528.62 meV/A +wall time from start: 0.00029 + +Adding atom(s) {"C": [0]} to the training set. +Uncertainties: [[0. 0. 0.]]. + +*-Frame: 2 +El Position (A) GP Force (ev/A) Std. 
Dev (ev/A) DFT Force (ev/A) +C 4.9738 5.3542 11.326 0.44344 -1.1703 -1.2127 0.15577 0.12357 0.094121 0.24856 0.44472 0.43621 +O 5.2943 4.0494 10.854 nan nan nan nan nan nan 0.2585 0.1257 -0.21401 +H 6.1343 4.1254 10.373 nan nan nan nan nan nan 0.037147 -0.17532 -0.15446 +H 4.5078 6.018 10.572 nan nan nan nan nan nan 0.10073 -0.17138 -0.17238 +H 5.8558 5.8985 11.765 nan nan nan nan nan nan -0.4542 -0.38165 -0.2078 +H 4.2382 5.1904 12.113 nan nan nan nan nan nan -0.18326 0.15205 0.30734 + +mean absolute error: 1152.93 meV/A +mean absolute dft component: 234.75 meV/A +mae per species +type C mae: 1152.93 meV/A +wall time from start: 0.00033 + +Adding atom(s) {"C": [0]} to the training set. +Uncertainties: [[0. 0. 0.]]. + +*-Frame: 3 +El Position (A) GP Force (ev/A) Std. Dev (ev/A) DFT Force (ev/A) +C 4.9345 5.3582 11.354 1.2039 -0.63736 -0.49531 0.18101 0.13748 0.10133 -0.56205 -0.73341 0.025774 +O 5.2551 4.0507 10.857 nan nan nan nan nan nan -0.10864 -0.35736 0.11981 +H 6.1354 4.0382 10.439 nan nan nan nan nan nan -0.16378 0.25194 0.092475 +H 4.4634 5.9486 10.55 nan nan nan nan nan nan 0.045248 0.25636 0.18297 +H 5.8113 5.8304 11.763 nan nan nan nan nan nan 0.64355 0.64017 0.1229 +H 4.2287 5.2411 12.211 nan nan nan nan nan nan 0.15043 -0.062927 -0.55067 + +mean absolute error: 794.36 meV/A +mean absolute dft component: 281.69 meV/A +mae per species +type C mae: 794.36 meV/A +wall time from start: 0.00053 + +Adding atom(s) {"C": [0]} to the training set. +Uncertainties: [[0. 0. 0.]]. + +*-Frame: 4 +El Position (A) GP Force (ev/A) Std. Dev (ev/A) DFT Force (ev/A) +C 4.893 5.3526 11.381 -1.7909 -0.79015 0.72632 0.16615 0.12703 0.10462 1.7604 0.18774 -0.6134 +O 5.2135 4.0474 10.862 nan nan nan nan nan nan -0.54736 -0.31382 0.43887 +H 6.1167 4.0035 10.517 nan nan nan nan nan nan 0.10004 0.32068 0.026511 +H 4.4303 5.933 10.566 nan nan nan nan nan nan -0.0076784 0.29726 0.24329 +H 5.8353 5.8618 11.76 nan nan nan nan nan nan -0.84744 -0.19008 -0.3897 +H 4.2246 5.2747 12.229 nan nan nan nan nan nan -0.45225 -0.3082 0.28713 + +mean absolute error: 1956.30 meV/A +mean absolute dft component: 407.32 meV/A +mae per species +type C mae: 1956.30 meV/A +wall time from start: 0.0005 + +Adding atom(s) {"C": [0]} to the training set. +Uncertainties: [[0. 0. 0.]]. +Train GP + +GP hyperparameters: +Hyp0 : l2 = 0.0123 +Hyp1 : s2 = 0.0099 +Hyp2 : l3 = 0.0000 +Hyp3 : s3 = 0.5079 +Hyp4 : n0 = 0.0010 +likelihood: -96.5194 +likelihood gradient: [-2.66642712e+03 5.64685351e+03 -5.10617254e-04 1.87489025e-09 + -1.81160621e+03] +wall time from start: 8.20 s +-------------------- +Run complete. diff --git a/tests/test_gp_from_aimd.py b/tests/test_gp_from_aimd.py index ce77f50b2..e380811cb 100644 --- a/tests/test_gp_from_aimd.py +++ b/tests/test_gp_from_aimd.py @@ -9,7 +9,9 @@ from flare.struc import Structure from flare.gp import GaussianProcess from flare.mgp.mgp_en import MappedGaussianProcess -from flare.gp_from_aimd import TrajectoryTrainer, subset_of_frame_by_element +from flare.gp_from_aimd import TrajectoryTrainer,\ + parse_trajectory_trainer_output +from flare.util import subset_of_frame_by_element from json import loads from flare.env import AtomicEnvironment from .test_mgp_unit import all_mgp, all_gp, get_random_structure @@ -255,3 +257,43 @@ def test_mgp_gpfa(all_mgp, all_gp): abs_std_tolerance=0, abs_force_tolerance=0) assert tt.mgp is True tt.run() + + +def test_parse_gpfa_output(): + """ + Compare parsing against known answers. 
+    :return:
+    """
+    frames, gp_data = parse_trajectory_trainer_output(
+        './test_files/gpfa_parse_test.out', True)
+
+    assert len(frames) == 5
+    assert isinstance(frames[0], dict)
+    for frame in frames:
+        for key in ['species', 'positions', 'gp_forces', 'dft_forces',
+                    'gp_stds']:
+
+            assert len(frame[key]) == 6
+
+        assert len(frame['added_atoms']) == 0 or len(frame['added_atoms']) == 1
+
+        assert frame['maes_by_species']['C']
+        assert frame['maes_by_species'].get('H') is None
+
+
+    assert gp_data['init_stats']['N'] == 0
+    assert gp_data['init_stats']['species'] == []
+    assert gp_data['init_stats']['envs_by_species'] == {}
+
+    assert gp_data['cumulative_gp_size'][-1] > 2
+    assert len(gp_data['mae_by_elt']['C']) == 5
+
+    assert gp_data['pre_train_stats']['N'] == 9
+    assert gp_data['pre_train_stats']['envs_by_species']['C'] == 2
+    assert gp_data['pre_train_stats']['envs_by_species']['H'] == 5
+    assert gp_data['pre_train_stats']['envs_by_species']['O'] == 2
+    assert gp_data['pre_train_stats']['species'] == ['H', 'C', 'O']
+
+    assert gp_data['cumulative_gp_size'] == [0, 9, 9, 10, 11, 12, 13]
+
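
Example usage of the new parser (an illustrative sketch, not part of the patch; the
output path is hypothetical, and matplotlib is only an optional assumption for plotting):

    from flare.gp_from_aimd import parse_trajectory_trainer_output

    # Parse a finished TrajectoryTrainer log. With return_gp_data=True the
    # parser also returns the GP-growth summary assembled from the per-frame
    # 'added_atoms' records and the GP statistics printed in the header.
    frames, gp_data = parse_trajectory_trainer_output(
        'path/to/gpfa_run.out', return_gp_data=True)

    # Per-frame force MAE of the GP against DFT for each element (meV/A).
    for i, frame in enumerate(frames):
        print(i, frame['maes_by_species'])

    # Growth of the training set (number of environments) over the run.
    print(gp_data['cumulative_gp_size'])

    # Optional plot of the per-element MAE trajectories:
    # import matplotlib.pyplot as plt
    # for elt, maes in gp_data['mae_by_elt'].items():
    #     plt.plot(maes, label=elt)
    # plt.xlabel('Frame'); plt.ylabel('MAE (meV/A)'); plt.legend(); plt.show()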