Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/steven/gpfa checkpoints #99

Merged
merged 4 commits into from
Oct 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 31 additions & 8 deletions flare/gp.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
"""Gaussian process model of the Born Oppenheimer potential energy surface."""
import math
from typing import List, Callable
import pickle
import json

import numpy as np

from typing import List, Callable
from scipy.linalg import solve_triangular
from scipy.optimize import minimize

from flare.env import AtomicEnvironment
from flare.struc import Structure
from flare.gp_algebra import get_ky_and_hyp, get_like_grad_from_mats, \
get_neg_likelihood, get_neg_like_grad, get_ky_and_hyp_par
from flare.kernels import str_to_kernel
from flare.mc_simple import str_to_mc_kernel

from flare.util import NumpyEncoder

class GaussianProcess:
""" Gaussian Process Regression Model.

Implementation is based on Algorithm 2.1 (pg. 19) of
"Gaussian Processes for Machine Learning" by Rasmussen and Williams"""

Expand Down Expand Up @@ -59,7 +63,6 @@ def __init__(self, kernel: Callable,
def update_db(self, struc: Structure, forces: list,
custom_range: List[int] = ()):
"""Given structure and forces, add to training set.

:param struc: structure to add to db
:type struc: Structure
:param forces: list of corresponding forces to add to db
Expand All @@ -86,7 +89,6 @@ def add_one_env(self, env: AtomicEnvironment,
force, train: bool = False, **kwargs):
"""
Tool to add a single environment / force pair into the training set.

:param force:
:param env:
:param force: (x,y,z) component associated with environment
Expand All @@ -103,7 +105,6 @@ def add_one_env(self, env: AtomicEnvironment,
@staticmethod
def force_list_to_np(forces: list):
""" Convert list of forces to numpy array of forces.

:param forces: list of forces to convert
:type forces: list<float>
:return: numpy array forces
Expand Down Expand Up @@ -203,7 +204,6 @@ def predict(self, x_t: AtomicEnvironment, d: int) -> [float, float]:

def predict_local_energy(self, x_t: AtomicEnvironment) -> float:
"""Predict the local energy of an atomic environment.

:param x_t: Atomic environment of test atom.
:type x_t: AtomicEnvironment
:return: local energy in eV (up to a constant).
Expand Down Expand Up @@ -238,7 +238,6 @@ def get_kernel_vector(self, x: AtomicEnvironment,
"""
Compute kernel vector, comparing input environment to all environments
in the GP's training set.

:param x: data point to compare against kernel matrix
:type x: AtomicEnvironment
:param d_1: Cartesian component of force vector to get (1=x,2=y,3=z)
Expand Down Expand Up @@ -427,3 +426,27 @@ def from_dict(dictionary):
new_gp.likelihood_gradient = dictionary['likelihood_gradient']

return new_gp

def write_model(self, name: str, format: str = 'json'):
"""
Write model in a variety of formats to a file for later re-use.

:param name: Output name
:param format:
:return:
"""

supported_formats = ['json', 'pickle', 'binary']

if format.lower() == 'json':
with open(name, 'w') as f:
json.dump(self.as_dict(), f, cls=NumpyEncoder)

elif format.lower() == 'pickle' or format.lower() == 'binary':
with open(name, 'wb') as f:
pickle.dump(self, f)

else:
raise ValueError("Output format not supported: try from "
"{}".format(supported_formats))

47 changes: 35 additions & 12 deletions flare/gp_from_aimd.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,18 @@
MD engine run.
"""
import time
from typing import List, Tuple
from copy import deepcopy

import json
import pickle
import numpy as np

from typing import List, Tuple
from copy import deepcopy

from flare.output import Output
from flare.struc import Structure, get_unique_species
from flare.struc import Structure
from flare.gp import GaussianProcess
from flare.env import AtomicEnvironment
from flare.util import Z_to_element
from flare.util import Z_to_element, element_to_Z
from flare.predict import predict_on_structure, \
predict_on_structure_par, predict_on_structure_en, \
predict_on_structure_par_en
Expand All @@ -37,7 +38,9 @@ def __init__(self, frames: List[Structure],
pre_train_seed_frames: List[Structure] = None,
pre_train_seed_envs: List[Tuple[AtomicEnvironment,
np.array]] = None,
pre_train_atoms_per_element: dict = None):
pre_train_atoms_per_element: dict = None,
checkpoint_interval: int = None,
model_format: str = 'json'):
"""
Class which trains a GP off of an AIMD trajectory, and generates
error statistics between the DFT and GP calls.
Expand All @@ -57,12 +60,14 @@ def __init__(self, frames: List[Structure],
:param n_cpus: Number of CPUs to parallelize over
:param shuffle_frames: Randomize order of frames for better training
:param verbose: 0: Silent, 1: Minimal, 2: Lots of information
:param model_write: Write output model here
:param model_write: Where to write output model
:param pre_train_on_skips: Train model on every n frames before running
:param pre_train_seed_frames: Frames to train on before running
:param pre_train_seed_envs: Environments to train on before running
:param pre_train_atoms_per_element: Max # of environments to add from
each species in the seed pre-training steps
:param checkpoint_interval: How often to write model after trainings
:param model_format: Format to write GP model to
"""

self.frames = frames
Expand Down Expand Up @@ -97,15 +102,26 @@ def __init__(self, frames: List[Structure],

# To later be filled in using the time library
self.start_time = None
self.pickle_name = model_write

self.pre_train_on_skips = pre_train_on_skips
self.seed_envs = [] if pre_train_seed_envs is None else \
pre_train_seed_envs
self.seed_frames = [] if pre_train_seed_frames is None \
else pre_train_seed_frames
self.pre_train_env_per_species = {} if pre_train_atoms_per_element \
is None else pre_train_atoms_per_element
is None else pre_train_atoms_per_element

# Convert to Coded Species
if self.pre_train_env_per_species:
pre_train_species = list(self.pre_train_env_per_species.keys())
for key in pre_train_species:
self.pre_train_env_per_species[element_to_Z(key)] = \
self.pre_train_env_per_species[key]

# Output parameters
self.checkpoint_interval = checkpoint_interval
self.model_write = model_write
self.model_format = model_format

def pre_run(self):
"""
Expand Down Expand Up @@ -244,9 +260,8 @@ def run(self):

self.output.conclude_run()

if self.pickle_name:
with open(self.pickle_name, 'wb') as f:
pickle.dump(self.gp, f)
if self.model_write:
self.gp.write_model(self.model_write, self.model_format)

def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
train: bool=True):
Expand Down Expand Up @@ -284,6 +299,11 @@ def train_gp(self):
self.gp.likelihood, self.gp.likelihood_gradient)
self.train_count += 1

if self.checkpoint_interval \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cool. This is nice. Maybe we should also add a variable "last_step_written" to record when the gp was dumped. So it won't dump twice at the end of the run.

and self.train_count % self.checkpoint_interval == 0 \
and self.model_write:
self.gp.write_model(self.model_write, self.model_format)

def is_std_in_bound(self, frame: Structure)->(bool, List[int]):
"""
If the predicted variance is too high, returns a list of atoms
Expand Down Expand Up @@ -324,3 +344,6 @@ def is_std_in_bound(self, frame: Structure)->(bool, List[int]):
return False, target_atoms
else:
return True, [-1]



32 changes: 32 additions & 0 deletions tests/test_gp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
import pytest
import pickle
import os
import json
import numpy as np

from pytest import raises

from flare.gp import GaussianProcess
from flare.env import AtomicEnvironment
from flare.struc import Structure
Expand Down Expand Up @@ -256,3 +262,29 @@ def test_serialization_method(two_body_gp, test_point):
for d in [0, 1, 2]:
assert np.all(two_body_gp.predict(x_t=test_point, d=d) ==
new_gp.predict(x_t=test_point, d=d))


def test_load_and_reload(two_body_gp, test_point):
nw13slx marked this conversation as resolved.
Show resolved Hide resolved

two_body_gp.write_model('two_body.pickle', 'pickle')

with open('two_body.pickle', 'rb') as f:
new_gp = pickle.load(f)

for d in [0, 1, 2]:
assert np.all(two_body_gp.predict(x_t=test_point, d=d) ==
new_gp.predict(x_t=test_point, d=d))
os.remove('two_body.pickle')

two_body_gp.write_model('two_body.json')
with open('two_body.json', 'r') as f:
new_gp = GaussianProcess.from_dict(json.loads(f.readline()))
for d in [0, 1, 2]:
assert np.all(two_body_gp.predict(x_t=test_point, d=d) ==
new_gp.predict(x_t=test_point, d=d))
os.remove('two_body.json')

with raises(ValueError):
two_body_gp.write_model('two_body.pickle', 'cucumber')


23 changes: 18 additions & 5 deletions tests/test_gp_from_aimd.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest
import numpy as np
import sys
from flare.gp import GaussianProcess
import pickle

from flare.env import AtomicEnvironment
from flare.struc import Structure
from flare.kernels import two_plus_three_body, two_plus_three_body_grad
Expand Down Expand Up @@ -70,6 +70,7 @@ def test_load_trained_gp_and_run(methanol_gp):
os.system('rm ./gp_from_aimd.xyz')
os.system('rm ./gp_from_aimd-f.xyz')


def test_load_one_frame_and_run():
the_gp = GaussianProcess(kernel=two_plus_three_body_mc,
kernel_grad=two_plus_three_body_mc_grad,
Expand Down Expand Up @@ -123,15 +124,27 @@ def test_seed_and_run():
skip=15,
pre_train_seed_envs=seeds,
pre_train_seed_frames=[frames[-1]],
model_write='test_methanol_gp.pickle',
max_atoms_from_frame=4)
max_atoms_from_frame=4,
model_write='meth_test.pickle',
model_format='pickle',
checkpoint_interval=1,
pre_train_atoms_per_element={'H': 1})

tt.run()

with open('meth_test.pickle', 'rb') as f:
new_gp = pickle.load(f)

test_env = envs[0]

for d in [0, 1, 2]:
assert np.all(the_gp.predict(x_t=test_env, d=d) ==
new_gp.predict(x_t=test_env, d=d))

os.system('rm ./gp_from_aimd.out')
os.system('rm ./gp_from_aimd.xyz')
os.system('rm ./gp_from_aimd-f.xyz')
os.system('rm ./test_methanol_gp.pickle')
os.system('rm ./meth_test.pickle')


def test_uncertainty_threshold(fake_gp):
Expand Down