Skip to content

Commit

Permalink
Qmlearn (#82)
Browse files Browse the repository at this point in the history
* Made base representations

* started CM and data class

* Working on generate routine

* Working basic example

* Mostly hacked the searchcv routines to work

* Implementing atomic gaussian kernel

* working atomic krr

* Restructure and started global slatm

* Slatm

* Started acsf

* stash before merging acsf bugfix

* acsf bugfix cherrypick

* sigma='auto' option added to kernels

* Started fchl

* Working fchl

* Started preprocessing

* Mostly working atom scaler

* Made several attributes private

* Restructured how the data object is passed, to avoid possible memory issues

* Started alchemy in kernels

* Minor change to kernel alchemy

* Working feature trick in kernels

* Cleaned up code

* daily

* Finished examples
  • Loading branch information
larsbratholm authored Sep 10, 2018
1 parent 2f47e85 commit 9143138
Show file tree
Hide file tree
Showing 30 changed files with 2,621 additions and 81 deletions.
11 changes: 11 additions & 0 deletions docs/source/qml.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,14 @@ qml\.aglaia module
:inherited-members:


qml\.qmlearn.representations module
------------------

.. automodule:: qml.qmlearn.representations
:inherited-members:

qml\.qmlearn.kernels module
------------------

.. automodule:: qml.qmlearn.kernels
:inherited-members:
287 changes: 287 additions & 0 deletions examples/qmlearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
import glob
import numpy as np
from qml import qmlearn
import sklearn.pipeline
import sklearn.model_selection

def data():
"""
Using the Data object.
"""
print("*** Begin data examples ***")

# The Data object has the same role as the Compound class.
# Where the Compound class is for one compound, the Data class
# Is for multiple

# One can load in a set of xyz files
filenames = sorted(glob.glob("../test/qm7/00*.xyz"))
data = qmlearn.Data(filenames)
print("length of filenames", len(filenames))
print("length of nuclear_charges", len(data.nuclear_charges))
print("length of coordinates", len(data.coordinates))

# Or just load a glob string
data = qmlearn.Data("../test/qm7/00*.xyz")
print("length of nuclear_charges", len(data.nuclear_charges))

# Energies (or other molecular properties) can be stored in the object
energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)[:98]
data.set_energies(energies)
print("length of energies", len(data.energies))

print("*** End data examples ***")
print()

def preprocessing():
"""
Rescaling energies
"""

print("*** Begin preprocessing examples ***")

# The AtomScaler object does a linear fit of the number of each element to the energy.
data = qmlearn.Data("../test/qm7/*.xyz")
energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)

# Input can be nuclear_charges and energies
print("Energies before rescaling", energies[:3])
rescaled_energies = qmlearn.preprocessing.AtomScaler().fit_transform(data.nuclear_charges, energies)
print("Energies after rescaling", rescaled_energies[:3])

# Or a data object can be used
data.set_energies(energies)
data2 = qmlearn.preprocessing.AtomScaler().fit_transform(data)
print("Energies after rescaling", data2.energies[:3])

print("*** End preprocessing examples ***")
print()

def representations():
"""
Creating representations. Currently implemented representations are
CoulombMatrix, AtomicCoulombMatrix, AtomicSLATM, GlobalSLATM,
FCHLRepresentations, AtomCenteredSymmetryFunctions.
(BagOfBonds is still missing)
"""

print("*** Begin representations examples ***")

data = qmlearn.Data("../test/qm7/*.xyz")

# Representations can be created from a data object
model = qmlearn.representations.CoulombMatrix(sorting ='row-norm')
representations = model.generate(data)
print("Shape of representations:", representations.shape)

# Alternatively the data object can be passed at initialization of the representation class
# and only select molecule indices can be parsed

model = qmlearn.representations.CoulombMatrix(data)
representations = model.generate([0,5,7,16])
print("Shape of representations:", representations.shape)

print("*** End representations examples ***")
print()

def kernels():
"""
Create kernels. Currently implemented kernels are GaussianKernel,
LaplacianKernel, FCHLKernel.
"""

print("*** Begin kernels examples ***")

data = qmlearn.Data("../test/qm7/*.xyz")
energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
data.set_energies(energies)

# Kernels can be created from representations
model = qmlearn.representations.CoulombMatrix(data)
indices = np.arange(100)
representations = model.generate(indices)

model = qmlearn.kernels.GaussianKernel(sigma='auto')
symmetric_kernels = model.generate(representations[:80])
print("Shape of symmetric kernels:", symmetric_kernels.shape)

asymmetric_kernels = model.generate(representations[:80], representations[80:])
print("Shape of asymmetric kernels:", asymmetric_kernels.shape)

# Atomic representations can be used as well
model = qmlearn.representations.AtomicCoulombMatrix(data)
indices = np.arange(100)
representations = model.generate(indices)

model = qmlearn.kernels.GaussianKernel(sigma='auto')
symmetric_kernels = model.generate(representations[:80], representation_type = 'atomic')
print("Shape of symmetric kernels:", symmetric_kernels.shape)

asymmetric_kernels = model.generate(representations[:80], representations[80:], representation_type = 'atomic')
print("Shape of asymmetric kernels:", asymmetric_kernels.shape)

print("*** End kernels examples ***")
print()

def models():
"""
Regression models. Only KernelRidgeRegression implemented so far.
"""

print("*** Begin models examples ***")

filenames = sorted(glob.glob("../test/qm7/*.xyz"))
data = qmlearn.Data(filenames)
energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
model = qmlearn.representations.CoulombMatrix(data)
# Create 1000 random indices
indices = np.arange(1000)
np.random.shuffle(indices)

representations = model.generate(indices)
model = qmlearn.kernels.GaussianKernel(sigma='auto')
symmetric_kernels = model.generate(representations[:800])
asymmetric_kernels = model.generate(representations[:800], representations[800:])

# Model can be fit giving kernel matrix and energies

model = qmlearn.models.KernelRidgeRegression()
model.fit(symmetric_kernels, energies[indices[:800]])
print("Fitted KRR weights:", model.alpha[:3])

# Predictions can be had from an asymmetric kernel
predictions = model.predict(asymmetric_kernels)
print("Predicted energies:", predictions[:3])
print("True energies:", energies[indices[:3]])

# Or the score (default negative mae) can be had directly
scores = model.score(asymmetric_kernels, energies[indices[800:]])
print("Negative MAE:", scores)

print("*** End models examples ***")
print()

def pipelines():
"""
Constructing scikit-learn pipelines
"""

print("*** Begin pipelines examples ***")

# It is much easier to do all this with a scikit-learn pipeline

# Create data
data = qmlearn.Data("../test/qm7/*.xyz")
energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
data.set_energies(energies)

# Create model
model = sklearn.pipeline.make_pipeline(
qmlearn.preprocessing.AtomScaler(data),
qmlearn.representations.CoulombMatrix(),
qmlearn.kernels.GaussianKernel(),
qmlearn.models.KernelRidgeRegression(),
)

# Create 1000 random indices
indices = np.arange(1000)
np.random.shuffle(indices)

model.fit(indices[:800])
scores = model.score(indices[800:])
print("Negative MAE:", scores)

# Passing alchemy=False to kernels makes sure that the atomic kernel only compares C to C, H to H etc.
# This will speed up kernels of some representations dramatically, but only works in pipelines

# Create model
model = sklearn.pipeline.make_pipeline(
qmlearn.preprocessing.AtomScaler(data),
qmlearn.representations.CoulombMatrix(),
qmlearn.kernels.GaussianKernel(alchemy=False),
qmlearn.models.KernelRidgeRegression(),
)

# Create 1000 random indices
indices = np.arange(1000)
np.random.shuffle(indices)

model.fit(indices[:800])
scores = model.score(indices[800:])
print("Negative MAE without alchemy:", scores)

print("*** End pipelines examples ***")
print()

def cross_validation():
"""
Doing cross validation with qmlearn
"""

print("*** Begin CV examples ***")

# Create data
data = qmlearn.Data("../test/qm7/*.xyz")
energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
data.set_energies(energies)

# Create model
model = sklearn.pipeline.make_pipeline(
qmlearn.preprocessing.AtomScaler(data),
qmlearn.representations.CoulombMatrix(),
qmlearn.kernels.GaussianKernel(),
qmlearn.models.KernelRidgeRegression(),
# memory='/dev/shm/' ### This will cache the previous steps to the virtual memory and might speed up gridsearch
)

# Create 1000 random indices
indices = np.arange(1000)
np.random.shuffle(indices)

# 3-fold CV of a given model can easily be done
scores = sklearn.model_selection.cross_validate(model, indices, cv=3)
print("Cross-validated scores:", scores['test_score'])

# Doing a grid search over hyper parameters
params = {'gaussiankernel__sigma': [10, 30, 100],
'kernelridgeregression__l2_reg': [1e-8, 1e-4],
}

grid = sklearn.model_selection.GridSearchCV(model, cv=3, refit=False, param_grid=params)
grid.fit(indices)
print("Best hyper parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

# As an alternative the pipeline can be constructed slightly different, which allows more complex CV
# Create model
model = sklearn.pipeline.Pipeline([
('preprocess', qmlearn.preprocessing.AtomScaler(data)),
('representations', qmlearn.representations.CoulombMatrix()),
('kernel', qmlearn.kernels.GaussianKernel()),
('model', qmlearn.models.KernelRidgeRegression())
],
# memory='/dev/shm/' ### This will cache the previous steps to the virtual memory and might speed up gridsearch
)

# Doing a grid search over hyper parameters
# including which kernel to use
params = {'kernel': [qmlearn.kernels.LaplacianKernel(), qmlearn.kernels.GaussianKernel()],
'kernel__sigma': [10, 30, 100, 1000, 3000, 1000],
'model__l2_reg': [1e-8, 1e-4],
}

grid = sklearn.model_selection.GridSearchCV(model, cv=3, refit=False, param_grid=params)
grid.fit(indices)
print("Best hyper parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

print("*** End CV examples ***")

if __name__ == '__main__':
data()
preprocessing()
representations()
kernels()
models()
pipelines()
cross_validation()
2 changes: 2 additions & 0 deletions qml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
from . import arad
from . import fchl
from . import representations
from . import qmlearn
from . import utils

__author__ = "Anders S. Christensen"
__copyright__ = "Copyright 2016"
Expand Down
Loading

0 comments on commit 9143138

Please sign in to comment.