diff --git a/docs/source/qml.rst b/docs/source/qml.rst
index 5a89437c7..cf23cb9bb 100644
--- a/docs/source/qml.rst
+++ b/docs/source/qml.rst
@@ -113,3 +113,14 @@ qml\.aglaia module
    :inherited-members:

+qml\.qmlearn.representations module
+-----------------------------------
+
+.. automodule:: qml.qmlearn.representations
+    :inherited-members:
+
+qml\.qmlearn.kernels module
+---------------------------
+
+.. automodule:: qml.qmlearn.kernels
+    :inherited-members:
diff --git a/examples/qmlearn.py b/examples/qmlearn.py
new file mode 100644
index 000000000..84af9e7be
--- /dev/null
+++ b/examples/qmlearn.py
@@ -0,0 +1,287 @@
+import glob
+import numpy as np
+from qml import qmlearn
+import sklearn.pipeline
+import sklearn.model_selection
+
+def data():
+    """
+    Using the Data object.
+    """
+    print("*** Begin data examples ***")
+
+    # The Data object has the same role as the Compound class.
+    # Where the Compound class is for one compound, the Data class
+    # is for multiple compounds
+
+    # One can load in a set of xyz files
+    filenames = sorted(glob.glob("../test/qm7/00*.xyz"))
+    data = qmlearn.Data(filenames)
+    print("length of filenames", len(filenames))
+    print("length of nuclear_charges", len(data.nuclear_charges))
+    print("length of coordinates", len(data.coordinates))
+
+    # Or just pass a glob string
+    data = qmlearn.Data("../test/qm7/00*.xyz")
+    print("length of nuclear_charges", len(data.nuclear_charges))
+
+    # Energies (or other molecular properties) can be stored in the object
+    energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)[:98]
+    data.set_energies(energies)
+    print("length of energies", len(data.energies))
+
+    print("*** End data examples ***")
+    print()
+
+def preprocessing():
+    """
+    Rescaling energies
+    """
+
+    print("*** Begin preprocessing examples ***")
+
+    # The AtomScaler object does a linear fit of the number of each element to the energy.
+    data = qmlearn.Data("../test/qm7/*.xyz")
+    energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
+
+    # Input can be nuclear_charges and energies
+    print("Energies before rescaling", energies[:3])
+    rescaled_energies = qmlearn.preprocessing.AtomScaler().fit_transform(data.nuclear_charges, energies)
+    print("Energies after rescaling", rescaled_energies[:3])
+
+    # Or a data object can be used
+    data.set_energies(energies)
+    data2 = qmlearn.preprocessing.AtomScaler().fit_transform(data)
+    print("Energies after rescaling", data2.energies[:3])
+
+    print("*** End preprocessing examples ***")
+    print()
+
+def representations():
+    """
+    Creating representations. Currently implemented representations are
+    CoulombMatrix, AtomicCoulombMatrix, AtomicSLATM, GlobalSLATM,
+    FCHLRepresentations, AtomCenteredSymmetryFunctions.
+    (BagOfBonds is still missing)
+    """
+
+    print("*** Begin representations examples ***")
+
+    data = qmlearn.Data("../test/qm7/*.xyz")
+
+    # Representations can be created from a data object
+    model = qmlearn.representations.CoulombMatrix(sorting ='row-norm')
+    representations = model.generate(data)
+    print("Shape of representations:", representations.shape)
+
+    # Alternatively the data object can be passed at initialization of the representation class
+    # and only selected molecule indices can be passed
+
+    model = qmlearn.representations.CoulombMatrix(data)
+    representations = model.generate([0,5,7,16])
+    print("Shape of representations:", representations.shape)
+
+    print("*** End representations examples ***")
+    print()
+
+def kernels():
+    """
+    Create kernels. Currently implemented kernels are GaussianKernel,
+    LaplacianKernel, FCHLKernel.
+ """ + + print("*** Begin kernels examples ***") + + data = qmlearn.Data("../test/qm7/*.xyz") + energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) + data.set_energies(energies) + + # Kernels can be created from representations + model = qmlearn.representations.CoulombMatrix(data) + indices = np.arange(100) + representations = model.generate(indices) + + model = qmlearn.kernels.GaussianKernel(sigma='auto') + symmetric_kernels = model.generate(representations[:80]) + print("Shape of symmetric kernels:", symmetric_kernels.shape) + + asymmetric_kernels = model.generate(representations[:80], representations[80:]) + print("Shape of asymmetric kernels:", asymmetric_kernels.shape) + + # Atomic representations can be used as well + model = qmlearn.representations.AtomicCoulombMatrix(data) + indices = np.arange(100) + representations = model.generate(indices) + + model = qmlearn.kernels.GaussianKernel(sigma='auto') + symmetric_kernels = model.generate(representations[:80], representation_type = 'atomic') + print("Shape of symmetric kernels:", symmetric_kernels.shape) + + asymmetric_kernels = model.generate(representations[:80], representations[80:], representation_type = 'atomic') + print("Shape of asymmetric kernels:", asymmetric_kernels.shape) + + print("*** End kernels examples ***") + print() + +def models(): + """ + Regression models. Only KernelRidgeRegression implemented so far. + """ + + print("*** Begin models examples ***") + + filenames = sorted(glob.glob("../test/qm7/*.xyz")) + data = qmlearn.Data(filenames) + energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) + model = qmlearn.representations.CoulombMatrix(data) + # Create 1000 random indices + indices = np.arange(1000) + np.random.shuffle(indices) + + representations = model.generate(indices) + model = qmlearn.kernels.GaussianKernel(sigma='auto') + symmetric_kernels = model.generate(representations[:800]) + asymmetric_kernels = model.generate(representations[:800], representations[800:]) + + # Model can be fit giving kernel matrix and energies + + model = qmlearn.models.KernelRidgeRegression() + model.fit(symmetric_kernels, energies[indices[:800]]) + print("Fitted KRR weights:", model.alpha[:3]) + + # Predictions can be had from an asymmetric kernel + predictions = model.predict(asymmetric_kernels) + print("Predicted energies:", predictions[:3]) + print("True energies:", energies[indices[:3]]) + + # Or the score (default negative mae) can be had directly + scores = model.score(asymmetric_kernels, energies[indices[800:]]) + print("Negative MAE:", scores) + + print("*** End models examples ***") + print() + +def pipelines(): + """ + Constructing scikit-learn pipelines + """ + + print("*** Begin pipelines examples ***") + + # It is much easier to do all this with a scikit-learn pipeline + + # Create data + data = qmlearn.Data("../test/qm7/*.xyz") + energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) + data.set_energies(energies) + + # Create model + model = sklearn.pipeline.make_pipeline( + qmlearn.preprocessing.AtomScaler(data), + qmlearn.representations.CoulombMatrix(), + qmlearn.kernels.GaussianKernel(), + qmlearn.models.KernelRidgeRegression(), + ) + + # Create 1000 random indices + indices = np.arange(1000) + np.random.shuffle(indices) + + model.fit(indices[:800]) + scores = model.score(indices[800:]) + print("Negative MAE:", scores) + + # Passing alchemy=False to kernels makes sure that the atomic kernel only compares C to C, H to H etc. 
+ # This will speed up kernels of some representations dramatically, but only works in pipelines + + # Create model + model = sklearn.pipeline.make_pipeline( + qmlearn.preprocessing.AtomScaler(data), + qmlearn.representations.CoulombMatrix(), + qmlearn.kernels.GaussianKernel(alchemy=False), + qmlearn.models.KernelRidgeRegression(), + ) + + # Create 1000 random indices + indices = np.arange(1000) + np.random.shuffle(indices) + + model.fit(indices[:800]) + scores = model.score(indices[800:]) + print("Negative MAE without alchemy:", scores) + + print("*** End pipelines examples ***") + print() + +def cross_validation(): + """ + Doing cross validation with qmlearn + """ + + print("*** Begin CV examples ***") + + # Create data + data = qmlearn.Data("../test/qm7/*.xyz") + energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) + data.set_energies(energies) + + # Create model + model = sklearn.pipeline.make_pipeline( + qmlearn.preprocessing.AtomScaler(data), + qmlearn.representations.CoulombMatrix(), + qmlearn.kernels.GaussianKernel(), + qmlearn.models.KernelRidgeRegression(), + # memory='/dev/shm/' ### This will cache the previous steps to the virtual memory and might speed up gridsearch + ) + + # Create 1000 random indices + indices = np.arange(1000) + np.random.shuffle(indices) + + # 3-fold CV of a given model can easily be done + scores = sklearn.model_selection.cross_validate(model, indices, cv=3) + print("Cross-validated scores:", scores['test_score']) + + # Doing a grid search over hyper parameters + params = {'gaussiankernel__sigma': [10, 30, 100], + 'kernelridgeregression__l2_reg': [1e-8, 1e-4], + } + + grid = sklearn.model_selection.GridSearchCV(model, cv=3, refit=False, param_grid=params) + grid.fit(indices) + print("Best hyper parameters:", grid.best_params_) + print("Best score:", grid.best_score_) + + # As an alternative the pipeline can be constructed slightly different, which allows more complex CV + # Create model + model = sklearn.pipeline.Pipeline([ + ('preprocess', qmlearn.preprocessing.AtomScaler(data)), + ('representations', qmlearn.representations.CoulombMatrix()), + ('kernel', qmlearn.kernels.GaussianKernel()), + ('model', qmlearn.models.KernelRidgeRegression()) + ], + # memory='/dev/shm/' ### This will cache the previous steps to the virtual memory and might speed up gridsearch + ) + + # Doing a grid search over hyper parameters + # including which kernel to use + params = {'kernel': [qmlearn.kernels.LaplacianKernel(), qmlearn.kernels.GaussianKernel()], + 'kernel__sigma': [10, 30, 100, 1000, 3000, 1000], + 'model__l2_reg': [1e-8, 1e-4], + } + + grid = sklearn.model_selection.GridSearchCV(model, cv=3, refit=False, param_grid=params) + grid.fit(indices) + print("Best hyper parameters:", grid.best_params_) + print("Best score:", grid.best_score_) + + print("*** End CV examples ***") + +if __name__ == '__main__': + data() + preprocessing() + representations() + kernels() + models() + pipelines() + cross_validation() diff --git a/qml/__init__.py b/qml/__init__.py index 1d98cf687..0c4d0e346 100644 --- a/qml/__init__.py +++ b/qml/__init__.py @@ -40,6 +40,8 @@ from . import arad from . import fchl from . import representations +from . import qmlearn +from . import utils __author__ = "Anders S. 
Christensen" __copyright__ = "Copyright 2016" diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index c174ce286..8a7808cfc 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -30,13 +30,13 @@ import tensorflow as tf from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error from sklearn.base import BaseEstimator -from qml.aglaia.symm_funct import generate_parkhill_acsf -from qml.aglaia.utils import InputError, ceil, is_positive_or_zero, is_positive_integer, is_positive, \ - is_bool, is_positive_integer_or_zero, is_string, is_positive_integer_array, is_array_like, is_none, \ +from .symm_funct import generate_parkhill_acsf +from ..utils import InputError, ceil, is_positive_or_zero, is_positive_integer, is_positive, \ + is_bool, is_positive_integer_or_zero, is_string, is_positive_integer_array, is_array_like, \ check_global_representation, check_y, check_sizes, check_dy, check_classes, is_numeric_array, is_non_zero_integer, \ is_positive_integer_or_zero_array, check_local_representation -from qml.aglaia.tf_utils import TensorBoardLogger +from .tf_utils import TensorBoardLogger try: from qml.data import Compound @@ -580,7 +580,7 @@ def _set_slatm_parameters(self, params): self.slatm_parameters = {'slatm_sigma1': 0.05, 'slatm_sigma2': 0.05, 'slatm_dgrid1': 0.03, 'slatm_dgrid2': 0.03, 'slatm_rcut': 4.8, 'slatm_rpower': 6, 'slatm_alchemy': False} - if not is_none(params): + if params is not None: for key, value in params.items(): if key in self.slatm_parameters: self.slatm_parameters[key] = value @@ -597,7 +597,7 @@ def _set_acsf_parameters(self, params): self.acsf_parameters = {'radial_cutoff': 10.0, 'angular_cutoff': 10.0, 'radial_rs': (0.0, 0.1, 0.2), 'angular_rs': (0.0, 0.1, 0.2), 'theta_s': (3.0, 2.0), 'zeta': 3.0, 'eta': 2.0} - if not is_none(params): + if params is not None: for key, value in params.items(): if key in self.acsf_parameters: self.acsf_parameters[key] = value @@ -658,7 +658,7 @@ def generate_compounds(self, filenames): """ # Check that the number of properties match the number of compounds if the properties have already been set - if is_none(self.properties): + if self.properties is None: pass else: if self.properties.size == len(filenames): @@ -683,18 +683,18 @@ def generate_representation(self, xyz=None, classes=None): :return: None """ - if is_none(self.compounds) and is_none(xyz) and is_none(classes): + if self.compounds is None and xyz is None and classes is None: raise InputError("QML compounds need to be created in advance or Cartesian coordinates need to be passed in " "order to generate the representation.") - if not is_none(self.representation): + if self.representation is not None: raise InputError("The representations have already been set!") - if is_none(self.compounds): + if self.compounds is None: self.representation, self.classes = self._generate_representations_from_data(xyz, classes) - elif is_none(xyz): + elif xyz is None: # Make representations from compounds self.representation, self.classes = self._generate_representations_from_compounds() @@ -708,7 +708,7 @@ def set_properties(self, properties): :param y: array of properties of size (nsamples,) :type y: array """ - if is_none(properties): + if properties is None: raise InputError("Properties cannot be set to none.") else: if is_numeric_array(properties) and np.asarray(properties).ndim == 1: @@ -725,10 +725,10 @@ def set_representations(self, representations): :type representations: numpy array of shape (n_samples, n_features) or (n_samples, n_atoms, n_features) """ 
- if not is_none(self.representation): + if self.representation is not None: raise InputError("The representations have already been set!") - if is_none(representations): + if representations is None: raise InputError("Descriptor cannot be set to none.") else: if is_numeric_array(representations): @@ -745,7 +745,7 @@ def set_gradients(self, gradients): :return: None """ - if is_none(gradients): + if gradients is None: raise InputError("Gradients cannot be set to none.") else: if is_numeric_array(gradients): @@ -762,7 +762,7 @@ def set_classes(self, classes): :type classes: numpy array of shape (n_samples, n_atoms) of ints :return: None """ - if is_none(classes): + if classes is None: raise InputError("Classes cannot be set to none.") else: if is_positive_integer_array(classes): @@ -1050,7 +1050,7 @@ def _initialise_representation(self, representation, parameters): raise InputError("Unknown representation %s" % representation) self.representation_name = representation.lower() - if not is_none(parameters): + if parameters is not None: if not type(parameters) is dict: raise InputError("The representation parameters passed should be either None or a dictionary.") @@ -1060,7 +1060,7 @@ def _initialise_representation(self, representation, parameters): else: - if not is_none(parameters): + if parameters is not None: raise InputError("The representation %s does not take any additional parameters." % (self.representation_name)) def _set_representation(self, representation): @@ -1098,7 +1098,7 @@ def _generate_representations_from_compounds(self): :rtype: numpy array of shape (n_samples, n_features) and None """ - if is_none(self.compounds): + if self.compounds is None: raise InputError("This should never happen.") n_samples = len(self.compounds) @@ -1368,18 +1368,18 @@ def _check_inputs(self, x, y, dy, classes): if not is_array_like(x): raise InputError("x should be an array either containing indices or data.") - if not is_none(dy) and not is_none(classes): + if dy is not None and classes is not None: raise InputError("MRMP estimator cannot predict gradients and do atomic decomposition.") # Check if x is made up of indices or data if is_positive_integer_or_zero_array(x): - if is_none(self.representation): - if is_none(self.compounds): + if self.representation is None: + if self.compounds is None: raise InputError("No representations or QML compounds have been set yet.") else: self.representation, _ = self._generate_representations_from_compounds() - if is_none(self.properties): + if self.properties is None: raise InputError("The properties need to be set in advance.") approved_x = self.representation[x] @@ -1391,7 +1391,7 @@ def _check_inputs(self, x, y, dy, classes): else: - if is_none(y): + if y is None: raise InputError("y cannot be of None type.") approved_x = check_global_representation(x) @@ -1420,18 +1420,18 @@ def _check_predict_input(self, x, classes): if not is_array_like(x): raise InputError("x should be an array either containing indices or data.") - if not is_none(classes): + if classes is not None: raise InputError("MRMP estimator cannot do atomic decomposition.") # Check if x is made up of indices or data if is_positive_integer_or_zero_array(x): - if is_none(self.representation): - if is_none(self.compounds): + if self.representation is None: + if self.compounds is None: raise InputError("No representations or QML compounds have been set yet.") else: self.representation, _ = self._generate_representations_from_compounds() - if is_none(self.properties): + if self.properties is None: 
raise InputError("The properties need to be set in advance.") approved_x = self.representation[x] @@ -1586,7 +1586,7 @@ def _initialise_representation(self, representation, parameters): raise InputError("Unknown representation %s" % representation) self.representation_name = representation.lower() - if not is_none(parameters): + if parameters is not None: if not type(parameters) is dict: raise InputError("The representation parameters passed should be either None or a dictionary.") self._check_representation_parameters(parameters) @@ -1601,7 +1601,7 @@ def _initialise_representation(self, representation, parameters): else: - if not is_none(parameters): + if parameters is not None: raise InputError("The representation %s does not take any additional parameters." % (self.representation_name)) def _set_representation(self, representation): @@ -1624,7 +1624,7 @@ def _generate_representations_from_data(self, xyz, classes): :rtype: numpy arrays of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms) """ - if is_none(classes): + if classes is None: raise InputError("The classes need to be provided for the ARMP estimator.") else: if len(classes.shape) > 2 or np.all(xyz.shape[:2] != classes.shape): @@ -1743,7 +1743,7 @@ def _generate_representations_from_compounds(self): :rtype: numpy array of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms) """ - if is_none(self.compounds): + if self.compounds is None: raise InputError("QML compounds needs to be created in advance") if self.representation_name == 'slatm': @@ -2028,22 +2028,22 @@ def _check_inputs(self, x, y, dy, classes): if not is_array_like(x): raise InputError("x should be an array either containing indices or data.") - if not is_none(dy): + if dy is not None: raise InputError("ARMP estimator cannot be used to predict gradients. 
Use ARMP_G estimator.") # Check if x is made up of indices or data if is_positive_integer_or_zero_array(x): - if is_none(self.representation): + if self.representation is None: - if is_none(self.compounds): + if self.compounds is None: raise InputError("No representations or QML compounds have been set yet.") else: self.representation, self.classes = self._generate_representations_from_compounds() - if is_none(self.properties): + if self.properties is None: raise InputError("The properties need to be set in advance.") - if is_none(self.classes): + if self.classes is None: raise InputError("The classes need to be set in advance.") approved_x = self.representation[x] @@ -2055,9 +2055,9 @@ def _check_inputs(self, x, y, dy, classes): else: - if is_none(y): + if y is None: raise InputError("y cannot be of None type.") - if is_none(classes): + if classes is None: raise InputError("ARMP estimator needs the classes to do atomic decomposition.") approved_x = check_local_representation(x) @@ -2089,12 +2089,12 @@ def _check_predict_input(self, x, classes): # Check if x is made up of indices or data if is_positive_integer_or_zero_array(x): - if is_none(self.representation): - if is_none(self.compounds): + if self.representation is None: + if self.compounds is None: raise InputError("No representations or QML compounds have been set yet.") else: self.representation, self.classes = self._generate_representations_from_compounds() - if is_none(self.properties): + if self.properties is None: raise InputError("The properties need to be set in advance.") approved_x = self.representation[x] @@ -2104,7 +2104,7 @@ def _check_predict_input(self, x, classes): else: - if is_none(classes): + if classes is None: raise InputError("ARMP estimator needs the classes to do atomic decomposition.") approved_x = check_local_representation(x) diff --git a/qml/arad/arad.py b/qml/arad/arad.py index 1d226de6c..df36f9520 100644 --- a/qml/arad/arad.py +++ b/qml/arad/arad.py @@ -33,7 +33,7 @@ from .farad_kernels import fget_atomic_kernels_arad from .farad_kernels import fget_atomic_symmetric_kernels_arad -from qml.data.alchemy import PTP +from qml.utils.alchemy import PTP def getAngle(sp,norms): epsilon = 10.* np.finfo(float).eps diff --git a/qml/data/__init__.py b/qml/data/__init__.py index 7b2ca3566..dcd15565d 100644 --- a/qml/data/__init__.py +++ b/qml/data/__init__.py @@ -23,4 +23,3 @@ from .xyzdataprovider import XYZDataProvider from .compound import Compound -from .alchemy import ELEMENT_NAME, NUCLEAR_CHARGE diff --git a/qml/data/compound.py b/qml/data/compound.py index bdd48c66f..2c95f5348 100644 --- a/qml/data/compound.py +++ b/qml/data/compound.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2016-2017 Anders Steen Christensen, Felix Faber, Lars Andersen Bratholm +# Copyright (c) 2016-2018 Anders Steen Christensen, Felix Faber, Lars Andersen Bratholm # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -25,7 +25,7 @@ import numpy as np import collections -from .alchemy import NUCLEAR_CHARGE +from ..utils import NUCLEAR_CHARGE from ..representations import generate_coulomb_matrix from ..representations import generate_atomic_coulomb_matrix diff --git a/qml/data/dataprovider.py b/qml/data/dataprovider.py index 72b8e0345..62418321f 100644 --- a/qml/data/dataprovider.py +++ b/qml/data/dataprovider.py @@ -44,4 +44,3 @@ def get_properties(self, idx=None): def read_database(self, db_filename): self.compounds = 
connect(db_filename) - diff --git a/qml/data/xyzdataprovider.py b/qml/data/xyzdataprovider.py index 2ee4adc2b..fd55e5b37 100644 --- a/qml/data/xyzdataprovider.py +++ b/qml/data/xyzdataprovider.py @@ -40,4 +40,3 @@ def add_structures(self, xyz_filenames): print(i, xyz_filename, self.properties[i]) compound = read(xyz_filename) self.compounds.write(compound) - diff --git a/qml/fchl/fchl_electric_field_kernels.py b/qml/fchl/fchl_electric_field_kernels.py index 6ba5ebc04..beb7c7fba 100644 --- a/qml/fchl/fchl_electric_field_kernels.py +++ b/qml/fchl/fchl_electric_field_kernels.py @@ -31,7 +31,7 @@ from .fchl_kernel_functions import get_kernel_parameters -from qml.data.alchemy import get_alchemy +from qml.utils.alchemy import get_alchemy # def get_local_kernels_ef(A, B, verbose=False, df=0.01, ef_scaling=0.01,\ diff --git a/qml/fchl/fchl_force_kernels.py b/qml/fchl/fchl_force_kernels.py index 51a64b0b8..9a5b4cc5f 100644 --- a/qml/fchl/fchl_force_kernels.py +++ b/qml/fchl/fchl_force_kernels.py @@ -37,7 +37,7 @@ from .fchl_kernel_functions import get_kernel_parameters -from qml.data.alchemy import get_alchemy +from qml.utils.alchemy import get_alchemy def get_gaussian_process_kernels(A, B, verbose=False, dx=0.005, \ diff --git a/qml/fchl/fchl_kernels.py b/qml/fchl/fchl_kernels.py index bd3bb2c2d..feb450ef0 100644 --- a/qml/fchl/fchl_kernels.py +++ b/qml/fchl/fchl_kernels.py @@ -52,7 +52,7 @@ from .fchl_kernel_functions import get_kernel_parameters -from qml.data.alchemy import get_alchemy +from qml.utils.alchemy import get_alchemy def get_local_kernels(A, B, \ diff --git a/qml/fchl/fchl_representations.py b/qml/fchl/fchl_representations.py index 60f8123a7..14d6183da 100644 --- a/qml/fchl/fchl_representations.py +++ b/qml/fchl/fchl_representations.py @@ -25,8 +25,8 @@ import numpy as np import copy -from qml.data.alchemy import get_alchemy -from qml.data.alchemy import ELEMENT_NAME +from qml.utils.alchemy import get_alchemy +from qml.utils import ELEMENT_NAME def generate_representation(coordinates, nuclear_charges, max_size=23, neighbors=23, cut_distance = 5.0, cell=None): diff --git a/qml/fchl/fchl_scalar_kernels.py b/qml/fchl/fchl_scalar_kernels.py index 52f6733bc..2418d88c9 100644 --- a/qml/fchl/fchl_scalar_kernels.py +++ b/qml/fchl/fchl_scalar_kernels.py @@ -36,7 +36,7 @@ from .ffchl_module import fget_atomic_local_kernels_fchl from .fchl_kernel_functions import get_kernel_parameters -from qml.data.alchemy import get_alchemy +from qml.utils.alchemy import get_alchemy def get_local_kernels(A, B, verbose=False,\ diff --git a/qml/kernels/kernels.py b/qml/kernels/kernels.py index a13c3389b..81f53f9e0 100644 --- a/qml/kernels/kernels.py +++ b/qml/kernels/kernels.py @@ -24,7 +24,7 @@ import numpy as np -from .fkernels import fgaussian_kernel +from .fkernels import fgaussian_kernel, fgaussian_kernel_symmetric from .fkernels import flaplacian_kernel from .fkernels import fgaussian_kernel_symmetric from .fkernels import flaplacian_kernel_symmetric @@ -34,6 +34,7 @@ from .fkernels import fget_local_kernels_gaussian from .fkernels import fget_local_kernels_laplacian +from .fkernels import fget_vector_kernels_gaussian, fget_vector_kernels_gaussian_symmetric def laplacian_kernel(A, B, sigma): """ Calculates the Laplacian kernel matrix K, where :math:`K_{ij}`: @@ -304,7 +305,7 @@ def get_local_kernels_gaussian(A, B, na, nb, sigmas): nma = len(na) nmb = len(nb) - + sigmas = np.asarray(sigmas) nsigmas = len(sigmas) diff --git a/qml/models/kernelridge.py b/qml/models/kernelridge.py index 47fab8c79..32f61cb54 
100644 --- a/qml/models/kernelridge.py +++ b/qml/models/kernelridge.py @@ -110,5 +110,3 @@ def _save(self, path, save_kernel=False): if save_kernel: np.save(path + "/K.npy") - - diff --git a/qml/qmlearn/__init__.py b/qml/qmlearn/__init__.py new file mode 100644 index 000000000..d747a4c37 --- /dev/null +++ b/qml/qmlearn/__init__.py @@ -0,0 +1,27 @@ +# MIT License +# +# Copyright (c) 2018 Lars Andersen Bratholm +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from .data import Data +from . import representations +from . import kernels +from . import preprocessing +from . import models diff --git a/qml/qmlearn/data.py b/qml/qmlearn/data.py new file mode 100644 index 000000000..3890e2641 --- /dev/null +++ b/qml/qmlearn/data.py @@ -0,0 +1,139 @@ + +from __future__ import print_function + +import glob +import numpy as np +from ..utils import NUCLEAR_CHARGE +import copy + + +class Data(object): + """ + Temporary data class which should be replaced at some point by the ASE-interface. + This could in principle also be replaced by a dictionary + + """ + + def __init__(self, filenames=None, property_type = "energy"): + """ + :param filenames: list of filenames or a string to be read by glob. e.g. 
'dir/*.xyz' + :type filenames: list or string + :param property_type: What kind of property will be predicted ('energy') + :type property_type: string + """ + + self.property_type = property_type + + self._set_ncompounds(0) + self.coordinates = None + self.nuclear_charges = None + self.natoms = None + self.energies = None + + if isinstance(filenames, str): + filenames = sorted(glob.glob(filenames)) + if isinstance(filenames, list): + self._parse_xyz_files(filenames) + # Overwritten in various parts of a standard prediction pipeline + # so don't use these within the class + #self._has_transformed_labels + #self._representations + #self._kernel + #self._indices + #self._representation_type + #self._representation_short_name + #self._representation_cutoff + #self._representation_alchemy + + def _set_ncompounds(self, n): + self.ncompounds = n + # Hack for sklearn CV + self.shape = (n,) + + def take(self, i, axis=None): + """ + Hack for sklearn CV + """ + other = copy.copy(self) + other._indices = i + return other + + # Hack for sklearn CV + def __getitem__(self, i): + return i + + # Hack for sklearn CV but also convenience + def __len__(self): + if hasattr(self, '_indices'): + return len(self._indices) + return self.ncompounds + + # Hack for sklearn CV but also convenience + def __eq__(self, other): + """ + Overrides the == operator. + """ + + if type(self) != type(other): + return False + + self_vars = vars(self) + other_vars = vars(other) + + if len(self_vars) != len(other_vars): + return False + + for key, val in self_vars.items(): + if val is not other_vars[key]: + return False + + return True + + # Hack for sklearn CV but also convenience + def __ne__(self, other): + """ + Overrides the != operator (unnecessary in Python 3) + """ + return not self.__eq__(other) + + def set_energies(self, energies): + self.energies = energies + + def _parse_xyz_files(self, filenames): + """ + Parse a list of xyz files. 
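# A short usage sketch for the Data object defined above, including the small
# scikit-learn compatibility hacks (shape, take, __getitem__, __len__). Paths follow
# the examples in examples/qmlearn.py; this block is illustrative and not part of the module.

import numpy as np
from qml import qmlearn

data = qmlearn.Data("../test/qm7/00*.xyz")
print(len(data), data.shape)           # number of compounds and the (n,) shape sklearn inspects
print(data.natoms[:3])                 # per-molecule atom counts parsed from the xyz files

sub = data.take(np.array([0, 2, 5]))   # shallow copy that only records the selected indices
print(len(sub))                        # 3, since __len__ follows the stored _indices

print(data[7])                         # __getitem__ returns the index itself, so sklearn's
                                       # cross-validation machinery passes indices through unchanged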
+ """ + + self._set_ncompounds(len(filenames)) + self.coordinates = np.empty(self.ncompounds, dtype=object) + self.nuclear_charges = np.empty(self.ncompounds, dtype=object) + self.natoms = np.empty(self.ncompounds, dtype = int) + + for i, filename in enumerate(filenames): + with open(filename, "r") as f: + lines = f.readlines() + + natoms = int(lines[0]) + self.natoms[i] = natoms + self.nuclear_charges[i] = np.empty(natoms, dtype=int) + self.coordinates[i] = np.empty((natoms, 3), dtype=float) + + for j, line in enumerate(lines[2:natoms+2]): + tokens = line.split() + + if len(tokens) < 4: + break + + self.nuclear_charges[i][j] = NUCLEAR_CHARGE[tokens[0]] + self.coordinates[i][j] = np.asarray(tokens[1:4], dtype=float) + + # Try to convert dtype to int/float in cases where you have the + # same molecule, just different conformers + + try: + self.nuclear_charges = np.asarray([self.nuclear_charges[i] for i in range(self.ncompounds)], + dtype=int) + self.coordinates = np.asarray([self.coordinates[i] for i in range(self.ncompounds)], + dtype=float) + except ValueError: + pass diff --git a/qml/qmlearn/kernels.py b/qml/qmlearn/kernels.py new file mode 100644 index 000000000..a1157af2d --- /dev/null +++ b/qml/qmlearn/kernels.py @@ -0,0 +1,845 @@ +# MIT License +# +# Copyright (c) 2018 Lars Andersen Bratholm +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import print_function + +import numpy as np + +from sklearn.base import BaseEstimator + +from .data import Data + +from ..kernels.fkernels import fgaussian_kernel +from ..kernels.fkernels import fgaussian_kernel_symmetric +from ..kernels.fkernels import fget_vector_kernels_gaussian +from ..kernels.fkernels import fget_vector_kernels_gaussian_symmetric +from ..kernels.fkernels import flaplacian_kernel +from ..kernels.fkernels import flaplacian_kernel_symmetric +from ..kernels.fkernels import fget_vector_kernels_laplacian +from ..kernels.fkernels import fget_vector_kernels_laplacian_symmetric + +from ..fchl.ffchl_module import fget_kernels_fchl +from ..fchl.ffchl_module import fget_symmetric_kernels_fchl +from ..fchl.ffchl_module import fget_global_kernels_fchl +from ..fchl.ffchl_module import fget_global_symmetric_kernels_fchl + +from ..utils.alchemy import get_alchemy +from ..utils import get_unique + +class _BaseKernel(BaseEstimator): + """ + Base class for kernels + """ + + def fit(self, X): + """ + The fit routine is needed for scikit-learn pipelines. + This is actually never used but has to be here. 
+ """ + raise NotImplementedError + + def transform(self, X, y = None): + """ + The transform routine is needed for scikit-learn pipelines. + """ + raise NotImplementedError + + def generate(self, **params): + raise NotImplementedError + + def _check_data_object(self, X): + # Check that we have a data object + if not isinstance(X, Data): + print("Error: Expected Data object as input in %s" % self.__class__.__name__) + raise SystemExit + + if not hasattr(X, '_representation_short_name'): + print("Error: No representations found in Data object") + raise SystemExit + + if X._representation_short_name == 'fchl' and self.__class__.__name__ != 'FCHLKernel' or \ + X._representation_short_name != 'fchl' and self.__class__.__name__ == 'FCHLKernel': + print("Error: The FCHL representation is only compatible with the FCHL kernel") + raise SystemExit + + def _set_representations(self, rep): + self.representations = rep + + def _set_nuclear_charges(self, charge): + self.nuclear_charges = charge + + def transform(self, X): + + self._check_data_object(X) + + # Kernel between representation stored in fit and representation + # given in data object. The order matters + if X._representation_type is None: + # For FCHL to keep the documentation tidy, since self.local overrides + # X._representation_type + kernel = self.generate(X._representations, self.representations) + elif X._representation_type == 'molecular': + # Ignore constant features + constant_features = (np.std(X._representations, axis=0) == 0) * self._constant_features * \ + (X._representations[0] == self.representations[0]) + kernel = self.generate(X._representations[:,~constant_features], self.representations[:,~constant_features], 'molecular') + elif (self.alchemy == 'auto' and X._representation_alchemy) or self.alchemy: + # Ignore constant features + flat_representations = np.asarray([item for sublist in X._representations for item in sublist]) + constant_features = (np.std(flat_representations, axis=0) == 0) * self._constant_features * \ + (X._representations[0][0] == self.representations[0][0]) + X_representations = [np.asarray([rep[~constant_features] for rep in mol]) for mol in X._representations] + self_representations = [np.asarray([rep[~constant_features] for rep in mol]) for mol in self.representations] + kernel = self.generate(X_representations, self_representations, 'atomic') + else: + # Find elements used in representations from X + elements1 = get_unique(X.nuclear_charges[X._indices]) + # Get the elements common to both X and the fitted representations + # stored in self + elements = list(set(elements1).intersection(self.elements)) + ## Get list in the form [H_representations, C_representations, ...] 
+ # And ignore constant features + rep1, rep2 = self._get_elementwise_representations_transform(X._representations, X.nuclear_charges[X._indices], elements) + # Sum elementwise contributions to the kernel + kernel = np.zeros((len(X._representations), len(self.representations))) + for i in range(len(rep1)): + kernel += self.generate(rep1[i], rep2[i], representation_type='atomic') + + X._kernel = kernel + + return X + + def _get_elementwise_representations_transform(self, representations, nuclear_charges, elements): + """ + Create a set of lists where the each item only contain representations of a specific element + """ + + # Ignore constant features + flat_nuclear_charges1 = np.asarray([item for sublist in nuclear_charges for item in sublist], dtype=int) + flat_representations1 = np.asarray([item for sublist in representations for item in sublist]) + flat_nuclear_charges2 = np.asarray([item for sublist in self.nuclear_charges for item in sublist], dtype=int) + flat_representations2 = np.asarray([item for sublist in self.representations for item in sublist]) + + constant_features = {element: None for element in elements} + + for ele in elements: + rep1 = flat_representations1[flat_nuclear_charges1 == ele] + rep2 = flat_representations2[flat_nuclear_charges2 == ele] + rep = np.concatenate([rep1, rep2]) + constant_features[ele] = (np.std(rep, axis=0) == 0) + + # Create the representations + elementwise_representations1 = [[np.atleast_2d( + [v[~constant_features[element]] for j,v in enumerate(representations[i]) + if nuclear_charges[i][j] == element]) for i in range(len(nuclear_charges))] + for element in elements] + + elementwise_representations2 = [[np.atleast_2d( + [v[~constant_features[element]] for j,v in enumerate(self.representations[i]) + if self.nuclear_charges[i][j] == element]) for i in range(len(self.nuclear_charges))] + for element in elements] + + return elementwise_representations1, elementwise_representations2 + + def _get_elementwise_representations_fit(self, representations, nuclear_charges): + """ + Create a list where the each item only contain representations of a specific element + """ + elements = get_unique(nuclear_charges) + self.elements = elements + + # Ignore constant features + flat_nuclear_charges = np.asarray([item for sublist in nuclear_charges for item in sublist], dtype=int) + flat_representations = np.asarray([item for sublist in representations for item in sublist]) + + constant_features = {element: None for element in elements} + + for ele in elements: + rep = flat_representations[flat_nuclear_charges == ele] + constant_features[ele] = (np.std(rep, axis=0) == 0) + + # Create the representations + elementwise_representations = [[np.atleast_2d( + [v[~constant_features[element]] for j,v in enumerate(representations[i]) + if nuclear_charges[i][j] == element]) for i in range(len(nuclear_charges))] + for element in elements] + + return elementwise_representations + + def _fit_transform(self, X): + + self._check_data_object(X) + + # Store representation / nuclear_charges for future transform calls + self._set_representations(X._representations) + self._set_nuclear_charges(X.nuclear_charges[X._indices]) + + if X._representation_type is None: + # For FCHL to keep the documentation tidy, since self.local overrides + # X._representation_type + kernel = self.generate(X._representations) + elif X._representation_type == 'molecular': + # Ignore constant features + self._constant_features = (np.std(X._representations, axis=0) == 0) + kernel = 
self.generate(X._representations[:,~self._constant_features], representation_type='molecular') + elif (self.alchemy == 'auto' and X._representation_alchemy) or self.alchemy: + # Ignore constant features + flat_representations = np.asarray([item for sublist in X._representations for item in sublist]) + self._constant_features = (np.std(flat_representations, axis=0) == 0) + representations = [np.asarray([rep[~self._constant_features] for rep in mol]) for mol in X._representations] + kernel = self.generate(representations, representation_type='atomic') + else: + # Get list in the form [H_representations, C_representations, ...] + # And ignore constant features + rep = self._get_elementwise_representations_fit(X._representations, X.nuclear_charges[X._indices]) + # Sum elementwise contributions to the kernel + kernel = np.zeros((len(X._representations),)*2) + for r in rep: + kernel += self.generate(r, representation_type='atomic') + + # Store kernel + X._kernel = kernel + + return X + + def fit_transform(self, X, y=None): + return self._fit_transform(X) + +class GaussianKernel(_BaseKernel): + """ + Gaussian kernel + """ + + def __init__(self, sigma='auto', alchemy='auto'): + """ + :param sigma: Scale parameter of the gaussian kernel. `sigma='auto'` will try to guess + a good value (very approximate). + :type sigma: float + :param alchemy: Determines if contributions from atoms of differing elements should be + included in the kernel. If alchemy='auto', this will be automatically + determined from the representation used + :type alchemy: bool + """ + self.sigma = sigma + self.alchemy = alchemy + + self.representations = None + self._elements = None + self._constant_features = None + + def _quick_estimate_sigma(self, X, representation_type, sigma_init=100, count=1): + """ + Use 50 random points for atomic, 200 for molecular, to get an approximate guess for sigma + """ + + if count > 10: + print("Error. Could not automatically determine parameter `sigma` in the kernel %s" + % self.__class__.__name__) + raise SystemExit + + if representation_type == 'molecular': + n = 200 + else: + n = 50 + + if len(X) < n: + n = len(X) + + self.sigma = sigma_init + + # Generate kernel for a random subset + indices = np.random.choice(np.arange(len(X)), size=n, replace=False) + kernel = self.generate([X[i] for i in indices], representation_type=representation_type) + + if representation_type == 'molecular': + # min smallest kernel element + smallest_kernel_element = np.min(kernel) + + # Rescale sigma such that smallest kernel element will be equal to 1/2 + self.sigma *= np.sqrt(-np.log(smallest_kernel_element) / np.log(2)) + # Update sigma if the given sigma was completely wrong + if np.isinf(self.sigma): + self._quick_estimate_sigma(X, representation_type, sigma_init * 10, count + 1) + elif self.sigma <= 1e-12: + self._quick_estimate_sigma(X, representation_type, sigma_init / 10, count + 1) + else: + sizes = np.asarray([len(X[i]) for i in indices]) + kernel /= sizes[:,None] * sizes[None,:] + smallest_kernel_element = np.min(kernel) + + if smallest_kernel_element < 0.5: + self._quick_estimate_sigma(X, representation_type, sigma_init * 2.5, count + 1) + elif smallest_kernel_element > 0.95: + self._quick_estimate_sigma(X, representation_type, sigma_init / 2.5, count + 1) + + def generate(self, X, Y=None, representation_type='molecular'): + """ + Create a gaussian kernel from representations `X`. Optionally + an asymmetric kernel can be constructed between representations + `X` and `Y`. 
+ If `representation_type=='molecular` it is assumed that the representations + are molecular and of shape (n_samples, representation_size). + If `representation_type=='atomic` the representations are assumed to be atomic + and the kernel will be computed as an atomic decomposition. + + :param X: representations + :type X: array + :param Y: (Optional) representations + :type Y: array + + :return: Gaussian kernel matrix of shape (n_samplesX, n_samplesX) if \ + Y=None else (n_samplesX, n_samplesY) + :rtype: array + """ + + if self.sigma == 'auto': + if representation_type == 'molecular': + auto_sigma = True + else: + auto_sigma = False + + # Do a quick and dirty initial estimate of sigma + self._quick_estimate_sigma(X, representation_type) + else: + auto_sigma = False + + + if representation_type == 'molecular': + kernel = self._generate_molecular(X,Y) + elif representation_type == 'atomic': + kernel = self._generate_atomic(X,Y) + else: + # Should never get here for users + print("Error: representation_type needs to be 'molecular' or 'atomic'. Got %s" % representation_type) + raise SystemExit + + if auto_sigma: + # Find smallest kernel element + smallest_kernel_element = np.min(kernel) + largest_kernel_element = np.max(kernel) + # Rescale kernel such that we don't have to calculate it again for a new sigma + alpha = - np.log(2) / np.log(smallest_kernel_element/largest_kernel_element) + self.sigma /= np.sqrt(alpha) + return kernel ** alpha + + return kernel + + def _generate_molecular(self, X, Y=None): + + X = np.asarray(X) + + # Note: Transposed for Fortran + n = X.shape[0] + + if Y is None or X is Y: + # Do symmetric matrix + K = np.empty((n, n), order='F') + fgaussian_kernel_symmetric(X.T, n, K, self.sigma) + else: + Y = np.asarray(Y) + # Do asymmetric matrix + m = Y.shape[0] + K = np.empty((n, m), order='F') + fgaussian_kernel(X.T, n, Y.T, m, K, self.sigma) + + return K + + def _generate_atomic(self, X, Y=None): + + n1 = np.array([len(x) for x in X], dtype=np.int32) + + max1 = np.max(n1) + + nm1 = n1.size + + for i in range(len(X)): + rep_size = X[0].shape[1] + if rep_size == 0: + continue + break + else: + if Y is None: + return np.zeros((nm1, nm1)) + return np.zeros((nm1, len(Y))) + + x1 = np.zeros((nm1, max1, rep_size), dtype=np.float64, order="F") + + for i in range(nm1): + if X[i].shape[1] == 0: + n1[i] = 0 + continue + x1[i,:n1[i]] = X[i] + + + # Reorder for Fortran speed + x1 = np.swapaxes(x1, 0, 2) + + sigmas = np.array([self.sigma], dtype=np.float64) + nsigmas = sigmas.size + + if Y is None or X is Y: + # Do symmetric matrix + return fget_vector_kernels_gaussian_symmetric(x1, n1, [self.sigma], + nm1, nsigmas)[0] + else: + # Do asymmetric matrix + n2 = np.array([len(y) for y in Y], dtype=np.int32) + max2 = np.max(n2) + nm2 = n2.size + x2 = np.zeros((nm2, max2, rep_size), dtype=np.float64, order="F") + for i in range(nm2): + if Y[i].shape[1] == 0: + n2[i] = 0 + continue + x2[i,:n2[i]] = Y[i] + x2 = np.swapaxes(x2, 0, 2) + return fget_vector_kernels_gaussian(x1, x2, n1, n2, [self.sigma], + nm1, nm2, nsigmas)[0] + +class LaplacianKernel(_BaseKernel): + """ + Laplacian kernel + """ + + def __init__(self, sigma='auto', alchemy='auto'): + """ + :param sigma: Scale parameter of the gaussian kernel. `sigma='auto'` will try to guess + a good value (very approximate). + :type sigma: float or string + :param alchemy: Determines if contributions from atoms of differing elements should be + included in the kernel. 
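# The sigma='auto' logic in GaussianKernel above relies on the fact that raising a
# Gaussian kernel to a power is equivalent to shrinking sigma: if K = exp(-d^2 / (2*s^2)),
# then K**a = exp(-d^2 / (2*(s/sqrt(a))^2)). That is why generate() can rescale an already
# computed kernel with kernel**alpha while dividing sigma by sqrt(alpha). A quick numerical
# check of the identity (plain numpy, not qmlearn code):

import numpy as np

d = 3.7      # distance between two representations
s = 100.0    # initial sigma guess
a = 2.5      # an arbitrary rescaling exponent

k = np.exp(-d**2 / (2 * s**2))
print(np.isclose(k**a, np.exp(-d**2 / (2 * (s / np.sqrt(a))**2))))   # True

# For the Laplacian kernel, K = exp(-d/s), the same trick requires dividing sigma by
# alpha itself rather than sqrt(alpha), which matches the LaplacianKernel code below.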
+ :type alchemy: bool + """ + self.sigma = sigma + self.alchemy = alchemy + + self.representations = None + self._elements = None + self._constant_features = None + + def _quick_estimate_sigma(self, X, representation_type, sigma_init=100, count=1): + """ + Use 50 random points for atomic, 200 for molecular, to get an approximate guess for sigma + """ + + if count > 10: + print("Error. Could not automatically determine parameter `sigma` in the kernel %s" + % self.__class__.__name__) + raise SystemExit + + if representation_type == 'molecular': + n = 200 + else: + n = 50 + + if len(X) < n: + n = len(X) + + self.sigma = sigma_init + + # Generate kernel for a random subset + indices = np.random.choice(np.arange(len(X)), size=n, replace=False) + kernel = self.generate([X[i] for i in indices], representation_type=representation_type) + + if representation_type == 'molecular': + # min smallest kernel element + smallest_kernel_element = np.min(kernel) + + # Rescale sigma such that smallest kernel element will be equal to 1/2 + self.sigma *= - np.log(smallest_kernel_element) / np.log(2) + # Update sigma if the given sigma was completely wrong + if np.isinf(self.sigma): + self._quick_estimate_sigma(X, representation_type, sigma_init * 10, count + 1) + elif self.sigma <= 1e-12: + self._quick_estimate_sigma(X, representation_type, sigma_init / 10, count + 1) + else: + sizes = np.asarray([len(X[i]) for i in indices]) + kernel /= sizes[:,None] * sizes[None,:] + smallest_kernel_element = np.min(kernel) + + if smallest_kernel_element < 0.5: + self._quick_estimate_sigma(X, representation_type, sigma_init * 2.5, count + 1) + elif smallest_kernel_element > 1: + self._quick_estimate_sigma(X, representation_type, sigma_init / 2.5, count + 1) + + def generate(self, X, Y=None, representation_type='molecular'): + """ + Create a gaussian kernel from representations `X`. Optionally + an asymmetric kernel can be constructed between representations + `X` and `Y`. + If `representation_type=='molecular` it is assumed that the representations + are molecular and of shape (n_samples, representation_size). + If `representation_type=='atomic` the representations are assumed to be atomic + and the kernel will be computed as an atomic decomposition. + + :param X: representations + :type X: array + :param Y: (Optional) representations + :type Y: array + + :return: Gaussian kernel matrix of shape (n_samplesX, n_samplesX) if \ + Y=None else (n_samplesX, n_samplesY) + :rtype: array + """ + + if self.sigma == 'auto': + if representation_type == 'molecular': + auto_sigma = True + else: + auto_sigma = False + + # Do a quick and dirty initial estimate of sigma + self._quick_estimate_sigma(X, representation_type) + else: + auto_sigma = False + + + if representation_type == 'molecular': + kernel = self._generate_molecular(X,Y) + elif representation_type == 'atomic': + kernel = self._generate_atomic(X,Y) + else: + # Should never get here for users + print("Error: representation_type needs to be 'molecular' or 'atomic'. 
Got %s" % representation_type) + raise SystemExit + + if auto_sigma: + # Find smallest kernel element + smallest_kernel_element = np.min(kernel) + largest_kernel_element = np.max(kernel) + # Rescale kernel such that we don't have to calculate it again for a new sigma + alpha = - np.log(2) / np.log(smallest_kernel_element/largest_kernel_element) + self.sigma /= alpha + return kernel ** alpha + + return kernel + + def _generate_molecular(self, X, Y=None): + + X = np.asarray(X) + + # Note: Transposed for Fortran + n = X.shape[0] + + if Y is None or X is Y: + # Do symmetric matrix + K = np.empty((n, n), order='F') + flaplacian_kernel_symmetric(X.T, n, K, self.sigma) + else: + Y = np.asarray(Y) + # Do asymmetric matrix + m = Y.shape[0] + K = np.empty((n, m), order='F') + flaplacian_kernel(X.T, n, Y.T, m, K, self.sigma) + + return K + + def _generate_atomic(self, X, Y=None): + + n1 = np.array([len(x) for x in X], dtype=np.int32) + + max1 = np.max(n1) + + nm1 = n1.size + + for i in range(len(X)): + rep_size = X[0].shape[1] + if rep_size == 0: + continue + break + else: + if Y is None: + return np.zeros((nm1, nm1)) + return np.zeros((nm1, len(Y))) + + x1 = np.zeros((nm1, max1, rep_size), dtype=np.float64, order="F") + + for i in range(nm1): + if X[i].shape[1] == 0: + n1[i] = 0 + continue + x1[i,:n1[i]] = X[i] + + #for i,j in zip(x1[:,:,2], n1): + # print(i,j) + + + # Reorder for Fortran speed + x1 = np.swapaxes(x1, 0, 2) + + sigmas = np.array([self.sigma], dtype=np.float64) + nsigmas = sigmas.size + + if Y is None or X is Y: + # Do symmetric matrix + return fget_vector_kernels_laplacian_symmetric(x1, n1, [self.sigma], + nm1, nsigmas)[0] + else: + # Do asymmetric matrix + n2 = np.array([len(y) for y in Y], dtype=np.int32) + max2 = np.max(n2) + nm2 = n2.size + x2 = np.zeros((nm2, max2, rep_size), dtype=np.float64, order="F") + for i in range(nm2): + if Y[i].shape[1] == 0: + n2[i] = 0 + continue + x2[i,:n2[i]] = Y[i] + x2 = np.swapaxes(x2, 0, 2) + return fget_vector_kernels_laplacian(x1, x2, n1, n2, [self.sigma], + nm1, nm2, nsigmas)[0] + +class FCHLKernel(_BaseKernel): + """ + FCHL kernel + """ + + def __init__(self, sigma='auto', alchemy=True, two_body_scaling=np.sqrt(8), + three_body_scaling=1.6, two_body_width=0.2, three_body_width=np.pi, + two_body_power=4.0, three_body_power=2.0, damping_start=1.0, cutoff=5.0, + fourier_order=1, alchemy_period_width=1.6, alchemy_group_width=1.6, + local=True): + """ + :param sigma: Scale parameter of the gaussian basis functions. `sigma='auto'` will try to guess + a good value (very approximate). + :type sigma: float or string + :param alchemy: Determines if contributions from atoms of differing elements should be + included in the kernel. + :type alchemy: bool + :param two_body_scaling: Weight for 2-body terms. + :type two_body_scaling: float + :param three_body_scaling: Weight for 3-body terms. + :type three_body_scaling: float + :param two_body_width: Gaussian width for 2-body terms + :type two_body_width: float + :param three_body_width: Gaussian width for 3-body terms. + :type three_body_width: float + :param two_body_power: Powerlaw for :math:`r^{-n}` 2-body terms. + :type two_body_power: float + :param three_body_power: Powerlaw for Axilrod-Teller-Muto 3-body term + :type three_body_power: float + :param damping_start: The fraction of the cutoff radius at which cut-off damping start. + :type damping_start: float + :param cutoff: Cut-off radius. + :type cutoff: float + :param fourier_order: 3-body Fourier-expansion truncation order. 
+ :type fourier_order: integer + :param alchemy_period_width: Gaussian width along periods (columns) in the periodic table. + :type alchemy_period_width: float + :param alchemy_group_width: Gaussian width along groups (rows) in the periodic table. + :type alchemy_group_width: float + :param local: Do local decomposition + :param local: bool + """ + self.sigma = sigma + self.alchemy = alchemy + self.two_body_scaling = two_body_scaling + self.three_body_scaling = three_body_scaling + self.two_body_width = two_body_width + self.three_body_width = three_body_width + self.two_body_power = two_body_power + self.three_body_power = three_body_power + self.damping_start = damping_start + self.cutoff = cutoff + self.fourier_order = fourier_order + self.alchemy_period_width = alchemy_period_width + self.alchemy_group_width = alchemy_group_width + self.local = local + + self.representations = None + + def _quick_estimate_sigma(self, X, sigma_init=1, count=1): + """ + Use 50 random points for atomic, 200 for molecular, to get an approximate guess for sigma + """ + + if count > 10: + print("Error. Could not automatically determine parameter `sigma` in the kernel %s" + % self.__class__.__name__) + raise SystemExit + + n = 50 + + if len(X) < n: + n = len(X) + + self.sigma = sigma_init + + # Generate kernel for a random subset + indices = np.random.choice(np.arange(len(X)), size=n, replace=False) + kernel = self.generate(X)#[indices]) + + if not self.local: + # min smallest kernel element + smallest_kernel_element = np.min(kernel) + if smallest_kernel_element < 0.3: + self._quick_estimate_sigma(X, sigma_init * 2.5, count + 1) + elif smallest_kernel_element > 0.9: + self._quick_estimate_sigma(X, sigma_init / 2.5, count + 1) + + else: + sizes = np.asarray([len(X[i]) for i in indices]) + kernel /= sizes[:,None] * sizes[None,:] + smallest_kernel_element = np.min(kernel) + + if smallest_kernel_element < 0.1: + self._quick_estimate_sigma(X, sigma_init * 2.5, count + 1) + elif smallest_kernel_element > 0.3: + self._quick_estimate_sigma(X, sigma_init / 2.5, count + 1) + + def generate(self, X, Y=None): + """ + Create a kernel from representations `X`. Optionally + an asymmetric kernel can be constructed between representations + `X` and `Y`. 
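# A hedged sketch of how the FCHLKernel above could be used inside a pipeline, mirroring
# the pipeline examples in examples/qmlearn.py. FCHLRepresentations is only named in this
# diff, so it is constructed with defaults here; the cutoff passed to the kernel must not
# exceed the cutoff used for the representation (checked further down in fit_transform).

import numpy as np
import sklearn.pipeline
from qml import qmlearn

data = qmlearn.Data("../test/qm7/*.xyz")
energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1)
data.set_energies(energies)

model = sklearn.pipeline.make_pipeline(
    qmlearn.preprocessing.AtomScaler(data),
    qmlearn.representations.FCHLRepresentations(),        # constructor arguments not shown in this diff
    qmlearn.kernels.FCHLKernel(sigma='auto', cutoff=5.0),  # kernel cutoff <= representation cutoff
    qmlearn.models.KernelRidgeRegression(),
)

indices = np.arange(1000)
np.random.shuffle(indices)

model.fit(indices[:800])
print("Negative MAE:", model.score(indices[800:]))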
+ + :param X: representations + :type X: array + :param Y: (Optional) representations + :type Y: array + + :return: Gaussian kernel matrix of shape (n_samplesX, n_samplesX) if \ + Y=None else (n_samplesX, n_samplesY) + :rtype: array + """ + + if self.sigma == 'auto': + # Do a quick and dirty initial estimate of sigma + self._quick_estimate_sigma(X) + + + if not self.local: + kernel = self._generate_molecular(X,Y) + else: + kernel = self._generate_atomic(X,Y) + + return kernel + + def _generate_molecular(self, X, Y=None): + + atoms_max = X.shape[1] + neighbors_max = X.shape[3] + nm1 = X.shape[0] + N1 = np.zeros((nm1),dtype=np.int32) + + for a in range(nm1): + N1[a] = len(np.where(X[a,:,1,0] > 0.0001)[0]) + + neighbors1 = np.zeros((nm1, atoms_max), dtype=np.int32) + + for a, representation in enumerate(X): + ni = N1[a] + for i, x in enumerate(representation[:ni]): + neighbors1[a,i] = len(np.where(x[0] < self.cutoff)[0]) + + if self.alchemy: + alchemy = 'periodic-table' + else: + alchemy = 'off' + + doalchemy, pd = get_alchemy(alchemy, emax=100, r_width=self.alchemy_group_width, c_width=self.alchemy_period_width) + + if Y is None or X is Y: + # Do symmetric kernel + return fget_global_symmetric_kernels_fchl(X, N1, neighbors1, [self.sigma], + nm1, 1, self.three_body_width, self.two_body_width, self.damping_start, + self.cutoff, self.fourier_order, pd, self.two_body_scaling, self.three_body_scaling, + doalchemy, self.two_body_power, self.three_body_power)[0] + else: + # Do asymmetric kernel + nm2 = Y.shape[0] + N2 = np.zeros((nm2),dtype=np.int32) + + for a in range(nm2): + N2[a] = len(np.where(Y[a,:,1,0] > 0.0001)[0]) + + neighbors2 = np.zeros((nm2, atoms_max), dtype=np.int32) + + for a, representation in enumerate(Y): + ni = N2[a] + for i, x in enumerate(representation[:ni]): + neighbors2[a,i] = len(np.where(x[0] < self.cutoff)[0]) + + return fget_global_kernels_fchl(X, Y, N1, N2, neighbors1, neighbors2, [self.sigma], + nm1, nm2, 1, self.three_body_width, self.two_body_width, self.damping_start, + self.cutoff, self.fourier_order, pd, self.two_body_scaling, self.three_body_scaling, + doalchemy, self.two_body_power, self.three_body_power)[0] + + def _generate_atomic(self, X, Y=None): + + atoms_max = X.shape[1] + neighbors_max = X.shape[3] + + nm1 = X.shape[0] + N1 = np.zeros((nm1),dtype=np.int32) + + for a in range(nm1): + N1[a] = len(np.where(X[a,:,1,0] > 0.0001)[0]) + + neighbors1 = np.zeros((nm1, atoms_max), dtype=np.int32) + + for a, representation in enumerate(X): + ni = N1[a] + for i, x in enumerate(representation[:ni]): + neighbors1[a,i] = len(np.where(x[0] < self.cutoff)[0]) + + if self.alchemy: + alchemy = 'periodic-table' + else: + alchemy = 'off' + + doalchemy, pd = get_alchemy(alchemy, emax=100, r_width=self.alchemy_group_width, c_width=self.alchemy_period_width) + + if Y is None or X is Y: + return fget_symmetric_kernels_fchl(X, N1, neighbors1, [self.sigma], + nm1, 1, self.three_body_width, self.two_body_width, self.damping_start, + self.cutoff, self.fourier_order, pd, self.two_body_scaling, self.three_body_scaling, + doalchemy, self.two_body_power, self.three_body_power)[0] + else: + nm2 = Y.shape[0] + N2 = np.zeros((nm2),dtype=np.int32) + + for a in range(nm2): + N2[a] = len(np.where(Y[a,:,1,0] > 0.0001)[0]) + + neighbors2 = np.zeros((nm2, atoms_max), dtype=np.int32) + + for a, representation in enumerate(Y): + ni = N2[a] + for i, x in enumerate(representation[:ni]): + neighbors2[a,i] = len(np.where(x[0] < self.cutoff)[0]) + + return fget_kernels_fchl(X, Y, N1, N2, neighbors1, 
neighbors2, [self.sigma],
+                nm1, nm2, 1, self.three_body_width, self.two_body_width, self.damping_start,
+                self.cutoff, self.fourier_order, pd, self.two_body_scaling, self.three_body_scaling,
+                doalchemy, self.two_body_power, self.three_body_power)[0]
+
+    def fit_transform(self, X, y=None):
+
+        # Check that the cutoff is not larger than the cutoff used to make the representation
+        if self.cutoff > X._representation_cutoff:
+            print("Error: Cutoff used in the FCHL kernel (%.2f) must be lower or equal" % self.cutoff,
+                  "to the cutoff used in the FCHL representation (%.2f)" % X._representation_cutoff)
+            raise SystemExit
+
+        return self._fit_transform(X)
+
diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py
new file mode 100644
index 000000000..f53c02d1a
--- /dev/null
+++ b/qml/qmlearn/models.py
@@ -0,0 +1,161 @@
+# MIT License
+#
+# Copyright (c) 2018 Lars A. Bratholm
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.metrics import mean_absolute_error, mean_squared_error
+
+from ..utils import is_numeric_array
+from .data import Data
+from ..math import cho_solve
+
+class _BaseModel(BaseEstimator):
+    """
+    Base class for all regression models
+    """
+
+    _estimator_type = "regressor"
+
+    def fit(self, X):
+        raise NotImplementedError
+
+    def predict(self, X):
+        raise NotImplementedError
+
+    def score(self, X, y=None):
+        """
+        Make predictions on `X` and return a score
+
+        :param X: Data object
+        :type X: object
+        :param y: Energies
+        :type y: array
+        :return: score
+        :rtype: float
+        """
+
+        # Make predictions
+        y_pred = self.predict(X)
+
+        # Get the true values
+        if is_numeric_array(y):
+            pass
+
+        elif isinstance(X, Data):
+            try:
+                y = X.energies[X._indices]
+            except:
+                print("No energies found in data object in module %s" % self.__class__.__name__)
+                raise SystemExit
+
+        else:
+            print("Expected variable 'X' to be Data object. Got %s" % str(X))
+            raise SystemExit
+
+        # Return the score
+        if self.scoring == 'mae':
+            return mean_absolute_error(y, y_pred)
+        elif self.scoring == 'neg_mae':
+            return - mean_absolute_error(y, y_pred)
+        elif self.scoring == 'rmsd':
+            return np.sqrt(mean_squared_error(y, y_pred))
+        elif self.scoring == 'neg_rmsd':
+            return - np.sqrt(mean_squared_error(y, y_pred))
+        elif self.scoring == 'neg_log_mae':
+            return - np.log(mean_absolute_error(y, y_pred))
+
+class KernelRidgeRegression(_BaseModel):
+    """
+    Standard Kernel Ridge Regression using a Cholesky solver
+    """
+
+    def __init__(self, l2_reg=1e-10, scoring='neg_mae'):
+        """
+        :param l2_reg: l2 regularization
+        :type l2_reg: float
+        :param scoring: Metric used for scoring ('mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae')
+        :type scoring: string
+        """
+        self.l2_reg = l2_reg
+        self.scoring = scoring
+
+        self.alpha = None
+
+    def fit(self, X, y=None):
+        """
+        Fit the Kernel Ridge Regression model using a Cholesky solver
+
+        :param X: Data object or kernel
+        :type X: object or array
+        :param y: Energies
+        :type y: array
+        """
+
+        if isinstance(X, Data):
+            try:
+                K, y = X._kernel, X.energies[X._indices]
+            except:
+                print("No kernel matrix and/or energies found in data object in module %s" % self.__class__.__name__)
+                raise SystemExit
+        elif is_numeric_array(X) and X.ndim == 2 and X.shape[0] == X.shape[1] and y is not None:
+            K = X
+        else:
+            print("Expected variable 'X' to be kernel matrix or Data object. Got %s" % str(X))
+            raise SystemExit
+
+        K[np.diag_indices_from(K)] += self.l2_reg
+
+        self.alpha = cho_solve(K, y)
+
+    def predict(self, X):
+        """
+        Predict properties from a kernel matrix or Data object,
+        using the weights obtained in a previous call to fit.
+
+        :param X: Data object or kernel
+        :type X: object or array
+        :return: Predicted properties
+        :rtype: array
+        """
+
+        # Check if model has been fit
+        if self.alpha is None:
+            print("Error: The %s model has not been trained yet" % self.__class__.__name__)
+            raise SystemExit
+
+        if isinstance(X, Data):
+            try:
+                K = X._kernel
+            except:
+                print("No kernel matrix found in data object in module %s" % self.__class__.__name__)
+                raise SystemExit
+        elif is_numeric_array(X) and X.ndim == 2 and X.shape[1] == self.alpha.size:
+            K = X
+        elif is_numeric_array(X) and X.ndim == 2 and X.shape[0] == self.alpha.size:
+            K = X.T
+        else:
+            print("Expected variable 'X' to be kernel matrix or Data object. Got %s" % str(X))
+            raise SystemExit
+
+        return np.dot(K, self.alpha)
diff --git a/qml/qmlearn/preprocessing.py b/qml/qmlearn/preprocessing.py
new file mode 100644
index 000000000..cab329075
--- /dev/null
+++ b/qml/qmlearn/preprocessing.py
@@ -0,0 +1,237 @@
+# MIT License
+#
+# Copyright (c) 2018 Lars Andersen Bratholm
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import print_function + +import copy + +import numpy as np +from sklearn.base import BaseEstimator +from sklearn.linear_model import LinearRegression + +from .data import Data +from ..utils import is_numeric_array, get_unique, is_positive_integer_or_zero_array + +class AtomScaler(BaseEstimator): + """ + Subtracts any constant offset or linear dependency on the number of atoms of each element from the property + """ + + def __init__(self, data=None, elements='auto'): + """ + :param data: Data object (optional) + :type data: Data object + :param elements: Elements to support. If `elements='auto'` try to determine this automatically. + :type elements: array + :param normalize: Normalize the transformed data such that the standard deviation is 1 + :type normalize: bool + """ + # Shallow copy should be fine + self._set_data(data) + self.elements = elements + + # Initialize model + self.model = LinearRegression() + + def _preprocess_input(self, X): + """ + Convenience function that processes X in a way such that + X can both be a data object, or an array of indices. And y + can be either values to transform or None. + + :param X: Data object, floating values or integer array of indices + :type X: Data object or array + :param y: Values or None + :type y: array or None + :return: Nuclear charges and values to transform + :rtype: tuple + """ + + if isinstance(X, Data): + + self._check_data(X) + + data = copy.copy(X) + + # Part of the sklearn CV hack. + if not hasattr(data, '_indices'): + data._indices = np.arange(len(data)) + + if hasattr(data, '_has_transformed_labels'): + print("Error: Target data has already been transformed by %s" % self.__class__.__name__) + raise SystemExit + + transformed_labels = np.zeros(len(data), dtype=bool) + transformed_labels[data._indices] = True + data._has_transformed_labels = transformed_labels + + elif self.data and is_positive_integer_or_zero_array(X) \ + and max(X) <= self.data.natoms.size: + # A copy here might avoid some unintended behaviour + # if multiple models is used sequentially. + data = copy.copy(self.data) + data._indices = np.asarray(X, dtype=int).ravel() + + + if hasattr(data, '_has_transformed_labels'): + if any(data._has_transformed_labels[data._indices] == True): + print("Error: Target data has already been transformed by %s" % self.__class__.__name__) + raise SystemExit + data._has_transformed_labels[data._indices] = True + else: + transformed_labels = np.zeros(len(data.energies), dtype=bool) + transformed_labels[data._indices] = True + data._has_transformed_labels = transformed_labels + + else: + print("Expected X to be array of indices or Data object. Got %s" % str(X)) + raise SystemExit + + return data + + def _check_data(self, X): + if X.natoms is None: + print("Error: Empty Data object passed to the %s transformer" % self.__class__.__name__) + raise SystemExit + + if X.energies is None: + print("Error: Expected Data object to have non-empty attribute 'energies'" % self.__class__.__name__) + raise SystemExit + + + def _set_data(self, data): + if data: + self._check_data(data) + self.data = data + + def fit_transform(self, X, y=None): + """ + Fit and transform the data with a linear model. + Supports three different types of input. 
+ 1) X is a list of nuclear charges and y is values to transform. + 2) X is an array of indices of which to transform. + 3) X is a data object + + :param X: List with nuclear charges or Data object. + :type X: list + :param y: Values to transform + :type y: array or None + :return: Array of transformed values or Data object, depending on input + :rtype: array or Data object + """ + + if not isinstance(X, Data) and y is not None: + data = None + nuclear_charges = X + else: + data = self._preprocess_input(X) + nuclear_charges = data.nuclear_charges[data._indices] + y = data.energies[data._indices] + + if self.elements == 'auto': + self.elements = get_unique(nuclear_charges) + else: + self._check_elements(nuclear_charges) + + + features = self._featurizer(nuclear_charges) + + delta_y = y - self.model.fit(features, y).predict(features) + + if data: + # Force copy + data.energies = data.energies.copy() + data.energies[data._indices] = delta_y + return data + else: + return delta_y + + def _check_elements(self, nuclear_charges): + """ + Check that the elements in the given nuclear_charges was + included in the fit. + """ + + elements_transform = get_unique(nuclear_charges) + if not np.isin(elements_transform, self.elements).all(): + print("Warning: Trying to transform molecules with elements", + "not included during fit in the %s method." % self.__class__.__name__, + "%s used in training but trying to transform %s" % (str(self.elements), str(elements_transform))) + + def _featurizer(self, X): + """ + Get the counts of each element as features. + """ + + n = len(X) + m = len(self.elements) + element_to_index = {v:i for i, v in enumerate(self.elements)} + features = np.zeros((n,m), dtype=int) + + for i, x in enumerate(X): + count_dict = {k:v for k,v in zip(*np.unique(x, return_counts=True))} + for key, value in count_dict.items(): + if key not in element_to_index: + continue + j = element_to_index[key] + features[i, j] = value + + return features + + def transform(self, X, y=None): + """ + Transform the data with the fitted linear model. + Supports three different types of input. + 1) X is a list of nuclear charges and y is values to transform. + 2) X is an array of indices of which to transform. + 3) X is a data object + + :param X: List with nuclear charges or Data object. 
+ :type X: list + :param y: Values to transform + :type y: array or None + :return: Array of transformed values or Data object, depending on input + :rtype: array or Data object + """ + + if not isinstance(X, Data) and y is not None: + data = None + nuclear_charges = X + else: + data = self._preprocess_input(X) + nuclear_charges = data.nuclear_charges[data._indices] + y = data.energies[data._indices] + + self._check_elements(nuclear_charges) + + features = self._featurizer(nuclear_charges) + + delta_y = y - self.model.predict(features) + + if data: + # Force copy + data.energies = data.energies.copy() + data.energies[data._indices] = delta_y + return data + else: + return delta_y + diff --git a/qml/qmlearn/representations.py b/qml/qmlearn/representations.py new file mode 100644 index 000000000..5cece8918 --- /dev/null +++ b/qml/qmlearn/representations.py @@ -0,0 +1,800 @@ +# MIT License +# +# Copyright (c) 2018 Lars Andersen Bratholm +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import print_function + +import copy +import itertools + +import numpy as np +from sklearn.base import BaseEstimator + +from .data import Data +from ..utils import is_positive_integer_or_zero_array, get_unique, get_pairs, is_string +from ..representations.frepresentations import fgenerate_coulomb_matrix +from ..representations.frepresentations import fgenerate_unsorted_coulomb_matrix +from ..representations.frepresentations import fgenerate_local_coulomb_matrix +from ..representations.frepresentations import fgenerate_atomic_coulomb_matrix +from ..representations.representations import get_slatm_mbtypes +from ..representations.representations import generate_slatm +from ..representations.facsf import fgenerate_acsf +from ..fchl.fchl_representations import generate_representation + +class _BaseRepresentation(BaseEstimator): + """ + Base class for representations + """ + + # Variables that has to be set in child methods + _representation_short_name = None + _representation_type = None + alchemy = None + + + def fit(self, X, y=None): + """ + The fit routine is needed for scikit-learn pipelines. + Must return self. + """ + raise NotImplementedError + + def transform(self, X): + """ + The transform routine is needed for scikit-learn pipelines. + Needs to store the representations in self.data.representations + and return self.data. 
+        """
+        raise NotImplementedError
+
+    def _extract_data(self, X):
+        """
+        Convenience function that processes X in a way such that
+        X can be either a Data object or an array of indices.
+
+        :param X: Data object or array of indices
+        :type X: Data object or array
+        :return: Data object
+        :rtype: Data object
+        """
+
+        if isinstance(X, Data):
+            self._check_data(X)
+            data = copy.copy(X)
+            if not hasattr(data, "_indices"):
+                data._indices = np.arange(len(data))
+
+        elif self.data and is_positive_integer_or_zero_array(X) \
+                and max(X) <= self.data.natoms.size:
+            # A copy here might avoid some unintended behaviour
+            # if multiple models are used sequentially.
+            data = copy.copy(self.data)
+            data._indices = np.asarray(X, dtype=int).ravel()
+        else:
+            print("Expected X to be array of indices or Data object. Got %s" % str(X))
+            raise SystemExit
+
+        # Store representation type / name for later use
+        data._representation_type = self._representation_type
+        data._representation_short_name = self._representation_short_name
+        data._representation_alchemy = self.alchemy
+
+        return data
+
+    def _check_data(self, X):
+        if X.natoms is None:
+            print("Error: Empty Data object passed to %s" % self.__class__.__name__)
+            raise SystemExit
+
+    def _set_data(self, data):
+        if data:
+            self._check_data(data)
+
+        self.data = data
+
+    def _check_elements(self, nuclear_charges):
+        """
+        Check that the elements in the given nuclear_charges were
+        included in the fit.
+        """
+
+        elements_transform = get_unique(nuclear_charges)
+        if not np.isin(elements_transform, self.elements).all():
+            print("Warning: Trying to transform molecules with elements",
+                  "not included during fit in the %s method." % self.__class__.__name__,
+                  "%s used in training but trying to transform %s" % (str(self.elements), str(elements_transform)))
+
+    # TODO Make it possible to pass data in other ways as well
+    # e.g. dictionary
+    def generate(self, X):
+        """
+        :param X: Data object or indices to use from the \
+            Data object passed at initialization
+        :type X: Data object or array of indices
+        :return: Representations of shape (n_samples, representation_size)
+        :rtype: array
+        """
+
+        return self.fit(X).transform(X)._representations
+
+class _MolecularRepresentation(_BaseRepresentation):
+    """
+    Base class for molecular representations
+    """
+
+    _representation_type = "molecular"
+
+class _AtomicRepresentation(_BaseRepresentation):
+    """
+    Base class for atomic representations
+    """
+
+    _representation_type = "atomic"
+    alchemy = None
+
+class CoulombMatrix(_MolecularRepresentation):
+    """
+    Coulomb Matrix representation as described in 10.1103/PhysRevLett.108.058301
+    """
+
+    _representation_short_name = "cm"
+
+    def __init__(self, data=None, size='auto', sorting="row-norm"):
+        """
+        Coulomb Matrix representation of a molecule.
+        Sorting of the elements can either be done by ``sorting="row-norm"`` or ``sorting="unsorted"``.
+        A matrix :math:`M` is constructed with elements
+
+        .. math::
+
+            M_{ij} =
+              \\begin{cases}
+                 \\tfrac{1}{2} Z_{i}^{2.4} & \\text{if } i = j \\\\
+                 \\frac{Z_{i}Z_{j}}{\\| {\\bf R}_{i} - {\\bf R}_{j}\\|} & \\text{if } i \\neq j
+              \\end{cases},
+
+        where :math:`i` and :math:`j` are atom indices, :math:`Z` is nuclear charge and
+        :math:`\\bf R` is the coordinate in Euclidean space.
+        If ``sorting = 'row-norm'``, the atom indices are reordered such that
+
+            :math:`\\sum_j M_{1j}^2 \\geq \\sum_j M_{2j}^2 \\geq ...
\\geq \\sum_j M_{nj}^2` + + The upper triangular of M, including the diagonal, is concatenated to a 1D + vector representation. + + If ``sorting = 'unsorted``, the elements are sorted in the same order as the input coordinates + and nuclear charges. + + The representation is calculated using an OpenMP parallel Fortran routine. + + :param size: The size of the largest molecule supported by the representation. + `size='auto'` will try to determine this automatically. + :type size: integer + :param sorting: How the atom indices are sorted ('row-norm', 'unsorted') + :type sorting: string + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + """ + + self.size = size + self.sorting = sorting + self._set_data(data) + + def fit(self, X, y=None): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :param y: Dummy argument for scikit-learn + :type y: NoneType + :return: self + :rtype: object + """ + + data = self._extract_data(X) + + natoms = data.natoms#[data._indices] + + if self.size == 'auto': + self.size = max(natoms) + + if self.size < max(natoms): + print("Warning: Maximum size of system increased from %d to %d" + % (self.size, max(natoms))) + self.size = max(natoms) + + return self + + def transform(self, X): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :return: Data object + :rtype: Data object + """ + + data = self._extract_data(X) + + nuclear_charges = data.nuclear_charges[data._indices] + coordinates = data.coordinates[data._indices] + natoms = data.natoms[data._indices] + + if max(natoms) > self.size: + print("Error: Representation can't be generated for molecule of size \ + %d when variable 'size' is %d" % (max(natoms), self.size)) + raise SystemExit + + representations = [] + if (self.sorting == "row-norm"): + for charge, xyz in zip(nuclear_charges, coordinates): + representations.append( + fgenerate_coulomb_matrix(charge, xyz, self.size)) + + elif (self.sorting == "unsorted"): + for charge, xyz in zip(nuclear_charges, coordinates): + representations.append( + fgenerate_unsorted_coulomb_matrix(charge, xyz, self.size)) + else: + print("ERROR: Unknown sorting scheme requested") + raise SystemExit + + data._representations = np.asarray(representations) + + return data + + +# TODO add reference +# TODO add support for atomic properties +# The representation can be calculated for a subset by either specifying +# ``indices = [0,1,...]``, where :math:`[0,1,...]` are the requested atom indices, +# or by specifying ``indices = 'C'`` to only calculate central carbon atoms. +class AtomicCoulombMatrix(_AtomicRepresentation): + """ + Atomic Coulomb Matrix representation as described in + + """ + + _representation_short_name = "acm" + alchemy = True + + def __init__(self, data=None, size=23, sorting="distance", central_cutoff=10.0, + central_decay=-1, interaction_cutoff=10.0, interaction_decay=-1): + """ + Creates a Coulomb Matrix representation of the local environment of a central atom. + For each central atom :math:`k`, a matrix :math:`M` is constructed with elements + + .. 
math:: + + M_{ij}(k) = + \\begin{cases} + \\tfrac{1}{2} Z_{i}^{2.4} \\cdot f_{ik}^2 & \\text{if } i = j \\\\ + \\frac{Z_{i}Z_{j}}{\\| {\\bf R}_{i} - {\\bf R}_{j}\\|} \\cdot f_{ik}f_{jk}f_{ij} & \\text{if } i \\neq j + \\end{cases}, + + where :math:`i`, :math:`j` and :math:`k` are atom indices, :math:`Z` is nuclear charge and + :math:`\\bf R` is the coordinate in euclidean space. + + :math:`f_{ij}` is a function that masks long range effects: + + .. math:: + + f_{ij} = + \\begin{cases} + 1 & \\text{if } \\|{\\bf R}_{i} - {\\bf R}_{j} \\| \\leq r - \Delta r \\\\ + \\tfrac{1}{2} \\big(1 + \\cos\\big(\\pi \\tfrac{\\|{\\bf R}_{i} - {\\bf R}_{j} \\| + - r + \Delta r}{\Delta r} \\big)\\big) + & \\text{if } r - \Delta r < \\|{\\bf R}_{i} - {\\bf R}_{j} \\| \\leq r - \Delta r \\\\ + 0 & \\text{if } \\|{\\bf R}_{i} - {\\bf R}_{j} \\| > r + \\end{cases}, + + where the parameters ``central_cutoff`` and ``central_decay`` corresponds to the variables + :math:`r` and :math:`\Delta r` respectively for interactions involving the central atom, + and ``interaction_cutoff`` and ``interaction_decay`` corresponds to the variables + :math:`r` and :math:`\Delta r` respectively for interactions not involving the central atom. + + if ``sorting = 'row-norm'``, the atom indices are ordered such that + + :math:`\\sum_j M_{1j}(k)^2 \\geq \\sum_j M_{2j}(k)^2 \\geq ... \\geq \\sum_j M_{nj}(k)^2` + + if ``sorting = 'distance'``, the atom indices are ordered such that + + .. math:: + + \\|{\\bf R}_{1} - {\\bf R}_{k}\\| \\leq \\|{\\bf R}_{2} - {\\bf R}_{k}\\| + \\leq ... \\leq \\|{\\bf R}_{n} - {\\bf R}_{k}\\| + + The upper triangular of M, including the diagonal, is concatenated to a 1D + vector representation. + + The representation is calculated using an OpenMP parallel Fortran routine. + + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + :param size: The maximum number of atoms within the cutoff radius supported by the representation. + `size='auto'` will try to determine this automatically. 
+ :type size: integer + :param sorting: How the atom indices are sorted ('row-norm', 'distance') + :type sorting: string + :param central_cutoff: The distance from the central atom, where the coulomb interaction + element will be zero + :type central_cutoff: float + :param central_decay: The distance over which the the coulomb interaction decays from full to none + :type central_decay: float + :param interaction_cutoff: The distance between two non-central atom, where the coulomb interaction + element will be zero + :type interaction_cutoff: float + :param interaction_decay: The distance over which the coulomb interaction decays from full to none + :type interaction_decay: float + """ + + self._set_data(data) + self.size = size + self.sorting = sorting + self.central_cutoff = central_cutoff + self.central_decay = central_decay + self.interaction_cutoff = interaction_cutoff + self.interaction_decay = interaction_decay + + def fit(self, X, y=None): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :param y: Dummy argument for scikit-learn + :type y: NoneType + :return: self + :rtype: object + """ + data = self._extract_data(X) + + natoms = data.natoms#[data._indices] + + if self.size == 'auto': + self.size = min(max(natoms), 2 * self.central_cutoff**3) + + return self + + def transform(self, X): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :return: Data object + :rtype: Data object + """ + + data = self._extract_data(X) + + nuclear_charges = data.nuclear_charges[data._indices] + coordinates = data.coordinates[data._indices] + natoms = data.natoms[data._indices] + + representations = [] + if (self.sorting == "row-norm"): + for charge, xyz, n in zip(nuclear_charges, coordinates, natoms): + representations.append(fgenerate_local_coulomb_matrix(np.arange(n)+1, n, charge, + xyz, n, self.size, self.central_cutoff, + self.central_decay, self.interaction_cutoff, self.interaction_decay)) + + elif (self.sorting == "distance"): + for charge, xyz, n in zip(nuclear_charges, coordinates, natoms): + representations.append(fgenerate_atomic_coulomb_matrix(np.arange(1,n+1), n, charge, + xyz, n, self.size, self.central_cutoff, + self.central_decay, self.interaction_cutoff, self.interaction_decay)) + + else: + print("ERROR: Unknown sorting scheme requested") + raise SystemExit + + data._representations = representations + + return data + + +class _SLATM(object): + """ + Routines for global and local SLATM + """ + + def _fit(self, X, y=None): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :param y: Dummy argument for scikit-learn + :type y: NoneType + :return: self + :rtype: object + """ + data = self._extract_data(X) + + if is_string(self.elements) and self.elements == 'auto' \ + and is_string(self.element_pairs) and self.element_pairs == 'auto': + nuclear_charges = data.nuclear_charges#[data._indices] + self.elements = get_unique(nuclear_charges) + self.element_pairs = get_slatm_mbtypes(nuclear_charges) + elif not is_string(self.elements) and is_string(self.element_pairs) \ + and self.element_pairs == 'auto': + self.element_pairs = get_slatm_mbtypes(self.elements * 3) + elif is_string(self.elements) and self.elements == 'auto' and \ + not is_string(self.element_pairs): + self.elements = get_unique(self.element_pairs) + + return 
self + + def _transform(self, X, local): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :param local: Use local or global version + :type local: bool + :return: Data object + :rtype: Data object + """ + + data = self._extract_data(X) + + nuclear_charges = data.nuclear_charges[data._indices] + coordinates = data.coordinates[data._indices] + natoms = data.natoms[data._indices] + + # Check that the molecules being transformed doesn't contain elements + # not used in the fit. + self._check_elements(nuclear_charges) + + representations = [] + for charge, xyz in zip(nuclear_charges, coordinates): + representations.append( + np.asarray( + generate_slatm(xyz, charge, self.element_pairs, local=local, + sigmas=[self.sigma2, self.sigma3], + dgrids=[self.dgrid2, self.dgrid3], rcut=self.rcut, + alchemy=self.alchemy, rpower=-self.rpower))) + + data._representations = np.asarray(representations) + + return data + +# TODO add reference +class GlobalSLATM(_SLATM, _MolecularRepresentation): + """ + Global version of SLATM. + """ + + _representation_short_name = "slatm" + + def __init__(self, data=None, sigma2=0.05, sigma3=0.05, dgrid2=0.03, + dgrid3=0.03, rcut=4.8, alchemy=False, rpower=-6, elements='auto', + element_pairs='auto'): + """ + Generate Spectrum of London and Axillrod-Teller-Muto potential (SLATM) representation. + + :param sigma2: Width of Gaussian smearing function for 2-body part. + :type sigma2: float + :param sigma3: Width of Gaussian smearing function for 3-body part. + :type sigma3: float + :param dgrid2: The interval between two sampled internuclear distances. + :type dgrid2: float + :param dgrid3: The interval between two sampled internuclear angles. + :type dgrid3: float + :param rcut: Cut-off radius. + :type rcut: float + :param alchemy: Use the alchemy version of SLATM. + :type alchemy: bool + :param rpower: The scaling power of R in 2-body potential. + :type rpower: float + :param elements: Atomnumber of elements that the representation should support. + `elements='auto'` will try to determine this automatically. + :type elements: list + :param element_pairs: Atomnumbers of element pairs that the representation should support. + `element_pairs='auto'` will try to determine this automatically. + :type element_pairs: list + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + """ + + self._set_data(data) + self.sigma2 = sigma2 + self.sigma3 = sigma3 + self.dgrid2 = dgrid2 + self.dgrid3 = dgrid3 + self.rcut = rcut + self.alchemy = alchemy + self.rpower = rpower + # Will be changed during fit + self.elements = elements + self.element_pairs = element_pairs + + def fit(self, X, y=None): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :param y: Dummy argument for scikit-learn + :type y: NoneType + :return: self + :rtype: object + """ + + return self._fit(X, y) + + def transform(self, X): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :return: Data object + :rtype: Data object + """ + + return self._transform(X, False) + + +# TODO add reference +class AtomicSLATM(_SLATM, _AtomicRepresentation): + """ + Local version of SLATM. 
+ """ + + _representation_short_name = "aslatm" + + def __init__(self, data=None, sigma2=0.05, sigma3=0.05, dgrid2=0.03, + dgrid3=0.03, rcut=4.8, alchemy=False, rpower=-6, elements='auto', + element_pairs='auto'): + """ + Generate Spectrum of London and Axillrod-Teller-Muto potential (SLATM) representation. + + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + :param sigma2: Width of Gaussian smearing function for 2-body part. + :type sigma2: float + :param sigma3: Width of Gaussian smearing function for 3-body part. + :type sigma3: float + :param dgrid2: The interval between two sampled internuclear distances. + :type dgrid2: float + :param dgrid3: The interval between two sampled internuclear angles. + :type dgrid3: float + :param rcut: Cut-off radius. + :type rcut: float + :param alchemy: Use the alchemy version of SLATM. + :type alchemy: bool + :param rpower: The scaling power of R in 2-body potential. + :type rpower: float + :param elements: Atomnumber of elements that the representation should support. + `elements='auto'` will try to determine this automatically. + :type elements: list + :param element_pairs: Atomnumbers of element pairs that the representation should support. + `element_pairs='auto'` will try to determine this automatically. + :type element_pairs: list + """ + + self._set_data(data) + self.sigma2 = sigma2 + self.sigma3 = sigma3 + self.dgrid2 = dgrid2 + self.dgrid3 = dgrid3 + self.rcut = rcut + self.alchemy = alchemy + self.rpower = rpower + # Will be changed during fit + self.elements = elements + self.element_pairs = element_pairs + + def fit(self, X, y=None): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :param y: Dummy argument for scikit-learn + :type y: NoneType + :return: self + :rtype: object + """ + + return self._fit(X, y) + + def transform(self, X): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :return: Data object + :rtype: Data object + """ + + return self._transform(X, True) + +class AtomCenteredSymmetryFunctions(_AtomicRepresentation): + """ + The variant of Atom-Centered Symmetry Functions used in 10.1039/C7SC04934J + """ + + _representation_short_name = "acsf" + alchemy = False + + def __init__(self, data=None, nbasis=3, precision=2, cutoff=5.0, elements='auto'): + """ + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + :param nbasis: Number of basis functions to use + :type nbasis: integer + :param cutoff: Cutoff radius + :type cutoff: float + :param precision: Precision in the basis functions. A value of 2 corresponds to the \ + basis functions intersecting at half maximum. Higher values makes \ + the basis functions narrower. + :type precision: float + :param elements: Atomnumber of elements that the representation should support. + `elements='auto'` will try to determine this automatically. 
+ :type elements: list + """ + + self._set_data(data) + self.nbasis = nbasis + self.precision = precision + self.cutoff = cutoff + # Will be changed during fit + self.elements = elements + + def fit(self, X, y=None): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :param y: Dummy argument for scikit-learn + :type y: NoneType + :return: self + :rtype: object + """ + data = self._extract_data(X) + + if is_string(self.elements) and self.elements == 'auto': + nuclear_charges = data.nuclear_charges#[data._indices] + self.elements = get_unique(nuclear_charges) + + return self + + def transform(self, X): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :return: Data object + :rtype: Data object + """ + + data = self._extract_data(X) + + nuclear_charges = data.nuclear_charges[data._indices] + coordinates = data.coordinates[data._indices] + natoms = data.natoms[data._indices] + + # Check that the molecules being transformed doesn't contain elements + # not used in the fit. + self._check_elements(nuclear_charges) + + # Calculate parameters needed for fortran input. + # This is a heuristic that cuts down the number of hyper-parameters + # and might be subject to future change. + min_distance = 0.8 + Rs = np.linspace(min_distance, self.cutoff, self.nbasis) + eta = 4 * np.log(self.precision) * ((self.nbasis-1)/(self.cutoff-min_distance))**2 + Ts = np.linspace(0, np.pi, self.nbasis) + zeta = - np.log(self.precision) / np.log(np.cos(np.pi / (4 * (self.nbasis - 1)))**2) + n_elements = len(self.elements) + size = n_elements * self.nbasis + (n_elements * (n_elements + 1)) // 2 * self.nbasis ** 2 + + representations = [] + for charge, xyz, n in zip(nuclear_charges, coordinates, natoms): + representations.append( + fgenerate_acsf(xyz, charge, self.elements, Rs, Rs, Ts, + eta, eta, zeta, self.cutoff, self.cutoff, n, size)) + + data._representations = np.asarray(representations) + + return data + +class FCHLRepresentation(_BaseRepresentation): + """ + The representation from 10.1063/1.5020710 + """ + + _representation_short_name = "fchl" + + def __init__(self, data=None, size='auto', cutoff=10.0): + """ + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + :param size: Max number of atoms in representation. `max_size='auto'` Will try to determine this + automatically. + :type size: integer + :param cutoff: Spatial cut-off distance - must be the same as used in the kernel function call. 
+ :type cutoff: float + """ + + self._set_data(data) + self.size = size + self.cutoff = cutoff + + def fit(self, X, y=None): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :param y: Dummy argument for scikit-learn + :type y: NoneType + :return: self + :rtype: object + """ + data = self._extract_data(X) + + natoms = data.natoms#[data._indices] + + if self.size == 'auto': + self.size = max(natoms) + + if self.size < max(natoms): + print("Warning: Maximum size of system increased from %d to %d" + % (self.size, max(natoms))) + self.size = max(natoms) + + return self + + def transform(self, X): + """ + :param X: Data object or indices to use from the \ + Data object passed at initialization + :type X: Data object or array of indices + :return: Data object + :rtype: Data object + """ + + data = self._extract_data(X) + + # Store cutoff to make sure that the kernel cutoff is less + data._representation_cutoff = self.cutoff + + nuclear_charges = data.nuclear_charges[data._indices] + coordinates = data.coordinates[data._indices] + natoms = data.natoms[data._indices] + + if max(natoms) > self.size: + print("Error: Representation can't be generated for molecule of size \ + %d when variable 'size' is %d" % (max(natoms), self.size)) + raise SystemExit + + representations = [] + for charge, xyz in zip(nuclear_charges, coordinates): + representations.append( + generate_representation(xyz, charge, max_size=self.size, cut_distance=self.cutoff)) + + data._representations = np.asarray(representations) + + return data diff --git a/qml/representations/representations.py b/qml/representations/representations.py index bd6416857..d3c83ee86 100644 --- a/qml/representations/representations.py +++ b/qml/representations/representations.py @@ -32,7 +32,7 @@ from .frepresentations import fgenerate_eigenvalue_coulomb_matrix from .frepresentations import fgenerate_bob -from qml.data.alchemy import NUCLEAR_CHARGE +from qml.utils import NUCLEAR_CHARGE from .slatm import get_boa from .slatm import get_sbop diff --git a/qml/utils/__init__.py b/qml/utils/__init__.py new file mode 100644 index 000000000..d710f72d9 --- /dev/null +++ b/qml/utils/__init__.py @@ -0,0 +1,25 @@ +# MIT License +# +# Copyright (c) 2017-2018 Silvia Amabilino, Lars Andersen Bratholm +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from . 
import alchemy +from .utils import * +from .alchemy import ELEMENT_NAME, NUCLEAR_CHARGE diff --git a/qml/data/alchemy.py b/qml/utils/alchemy.py similarity index 100% rename from qml/data/alchemy.py rename to qml/utils/alchemy.py diff --git a/qml/aglaia/utils.py b/qml/utils/utils.py similarity index 94% rename from qml/aglaia/utils.py rename to qml/utils/utils.py index 87beac4d7..05b96cbd5 100644 --- a/qml/aglaia/utils.py +++ b/qml/utils/utils.py @@ -41,9 +41,6 @@ def is_positive_integer_or_zero(x): def is_string(x): return isinstance(x, str) -def is_none(x): - return isinstance(x, type(None)) - def is_dict(x): return isinstance(x, dict) @@ -88,10 +85,28 @@ def _is_integer_array(x): def is_positive_integer_array(x): return (_is_integer_array(x) and _is_positive_array(x)) - def is_positive_integer_or_zero_array(x): return (_is_integer_array(x) and _is_positive_or_zero_array(x)) +def get_unique(x): + """ + Gets all unique elements in lists of lists + """ + elements = list(set(item for l in x for item in l)) + return elements + +def get_pairs(x): + """ + Get all unique pairs. E.g. x = [1,2,3] will return + [[1, 1], [1, 2], [1, 3], [2, 2], [2, 3], [3, 3]] + """ + pairs = [] + for i,v in enumerate(x): + for w in x[i:]: + pairs.append([v,w]) + return pairs + + # ------------- ** Checking inputs ** -------------------------- def check_global_representation(x): @@ -157,14 +172,14 @@ def check_sizes(x, y=None, dy=None, classes=None): :return: None """ - if is_none(dy) and is_none(classes): + if dy is None and classes is None: if x.shape[0] != y.shape[0]: raise InputError("The descriptor and the properties should have the same first number of elements in the " "first dimension. Got %s and %s" % (x.shape[0], y.shape[0])) - elif is_none(y) and is_none(dy): - if is_none(classes): + elif y is None and dy is None: + if classes is None: raise InputError("Only x is not none.") else: if x.shape[0] != classes.shape[0]: @@ -173,7 +188,7 @@ def check_sizes(x, y=None, dy=None, classes=None): if x.shape[1] != classes.shape[1]: raise InputError("The number of atoms in the descriptor and in the classes is different: %s and %s." % (x.shape[1], classes.shape[1])) - elif is_none(dy) and not is_none(classes): + elif dy is None and classes is not None: if x.shape[0] != y.shape[0] or x.shape[0] != classes.shape[0]: raise InputError("All x, y and classes should have the first number of elements in the first dimension. 
Got " @@ -202,7 +217,7 @@ def check_dy(dy): :return: numpy array of floats of shape (n_samples, n_atoms, 3) """ - if is_none(dy): + if dy is None: approved_dy = dy else: if not is_array_like(dy): @@ -227,7 +242,7 @@ def check_classes(classes): :return: numpy array of ints of shape (n_samples, n_atoms) """ - if is_none(classes): + if classes is None: approved_classes = classes else: if not is_array_like(classes): @@ -244,13 +259,6 @@ def check_classes(classes): return approved_classes - - - - - - - # #def _is_numeric_array(x): # try: diff --git a/setup.py b/setup.py index a1d4e1ff9..0f30de11f 100755 --- a/setup.py +++ b/setup.py @@ -146,7 +146,9 @@ def setup_qml(): 'qml.kernels', 'qml.math', 'qml.representations', - 'qml.models', + 'qml.qmlearn', + 'qml.utils', + 'qml.models' ], # metadata diff --git a/test/test_armp.py b/test/test_armp.py index b7bc3ddad..4c6fb10ab 100644 --- a/test/test_armp.py +++ b/test/test_armp.py @@ -27,7 +27,7 @@ import numpy as np from qml.aglaia.aglaia import ARMP -from qml.aglaia.utils import InputError +from qml.utils import InputError import glob import os diff --git a/test/test_mrmp.py b/test/test_mrmp.py index ffe32e585..1f98d7602 100644 --- a/test/test_mrmp.py +++ b/test/test_mrmp.py @@ -27,7 +27,7 @@ import numpy as np from qml.aglaia.aglaia import MRMP -from qml.aglaia.utils import InputError +from qml.utils import InputError import glob import os import shutil diff --git a/test/test_neural_network.py b/test/test_neural_network.py index 775335f12..21ec5318b 100644 --- a/test/test_neural_network.py +++ b/test/test_neural_network.py @@ -29,7 +29,7 @@ # TODO relative imports from qml.aglaia.aglaia import MRMP -from qml.aglaia.utils import InputError +from qml.utils import InputError # ------------ ** All functions to test the inputs to the classes ** ---------------
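The `sigma='auto'` heuristic in qml/qmlearn/kernels.py is a simple bracketing search: rescale sigma by a factor of 2.5 until the smallest element of a trial kernel falls inside a target window. Below is a minimal standalone sketch of the same idea, using a plain Gaussian kernel on random data instead of the FCHL routines; the 0.3-0.9 window mirrors the molecular branch and everything else is a toy assumption.

import numpy as np

def quick_sigma(X, sigma=1.0, lo=0.3, hi=0.9, factor=2.5, max_iter=10):
    """Rescale sigma until the smallest kernel element lies in [lo, hi]."""
    for _ in range(max_iter):
        # Trial kernel on a random subset of at most 50 points
        sub = X[np.random.choice(len(X), size=min(50, len(X)), replace=False)]
        d2 = np.sum((sub[:, None, :] - sub[None, :, :])**2, axis=-1)
        K = np.exp(-d2 / (2 * sigma**2))
        smallest = K.min()
        if smallest < lo:      # kernel too narrow -> widen it
            sigma *= factor
        elif smallest > hi:    # kernel too wide -> narrow it
            sigma /= factor
        else:
            break
    return sigma

print(quick_sigma(np.random.rand(200, 10)))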
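The fit_transform check in the FCHL kernel refuses to run with a cutoff larger than the one stored by FCHLRepresentation, so the two stages should share one value. Below is a rough sketch of wiring them up by hand; the kernel class name (here FCHLKernel) and its exact keyword arguments are assumptions inferred from the attributes set in its __init__, not verified against the module.

import numpy as np
from qml import qmlearn

data = qmlearn.Data("../test/qm7/*.xyz")

cutoff = 8.0   # one shared value; the kernel cutoff may be lower, never higher
rep = qmlearn.representations.FCHLRepresentation(data, cutoff=cutoff)
X = rep.generate(np.arange(50))

kernel = qmlearn.kernels.FCHLKernel(sigma='auto', cutoff=cutoff)   # assumed name/signature
K = kernel.generate(X)
print(K.shape)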
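KernelRidgeRegression in qml/qmlearn/models.py solves the standard closed-form problem alpha = (K + l2_reg * I)^-1 y with a Cholesky factorisation and predicts with the rectangular test/train kernel. The same algebra in plain NumPy/SciPy is sketched below; scipy's cho_factor/cho_solve stand in for qml.math.cho_solve, and the Gaussian kernel and random data are made up.

import numpy as np
from scipy.linalg import cho_factor, cho_solve

def gaussian_kernel(A, B, sigma=2.0):
    d2 = np.sum((A[:, None, :] - B[None, :, :])**2, axis=-1)
    return np.exp(-d2 / (2 * sigma**2))

X_train, X_test = np.random.rand(80, 5), np.random.rand(20, 5)
y_train = np.random.rand(80)

K = gaussian_kernel(X_train, X_train)
K[np.diag_indices_from(K)] += 1e-10            # l2 regularization on the diagonal
alpha = cho_solve(cho_factor(K), y_train)      # training weights

K_test = gaussian_kernel(X_test, X_train)      # shape (n_test, n_train)
y_pred = K_test.dot(alpha)
print(y_pred[:3])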
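AtomScaler in qml/qmlearn/preprocessing.py removes the part of the energy that is linear in the element counts, so the kernel model only has to learn the remainder. A rough scikit-learn-only equivalent of that featurization and baseline fit is sketched below; the toy molecules and energies are invented for illustration.

import numpy as np
from sklearn.linear_model import LinearRegression

nuclear_charges = [[6, 1, 1, 1, 1], [8, 1, 1], [6, 8, 1, 1], [6, 6, 1, 1, 1, 1]]
energies = np.array([-40.5, -76.4, -114.5, -78.6])

elements = sorted({z for mol in nuclear_charges for z in mol})
counts = np.array([[mol.count(z) for z in elements] for mol in nuclear_charges])

baseline = LinearRegression().fit(counts, energies)
delta = energies - baseline.predict(counts)   # what the kernel model is trained on
print(delta)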
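AtomCenteredSymmetryFunctions derives all Fortran inputs from just nbasis, precision and cutoff. The heuristic from its transform method can be evaluated on its own to see the resulting grids and representation size; the parameter values below are only illustrative.

import numpy as np

nbasis, precision, cutoff = 3, 2, 5.0
n_elements = 3          # e.g. H, C, O
min_distance = 0.8

Rs = np.linspace(min_distance, cutoff, nbasis)        # radial grid
eta = 4 * np.log(precision) * ((nbasis - 1) / (cutoff - min_distance))**2
Ts = np.linspace(0, np.pi, nbasis)                    # angular grid
zeta = -np.log(precision) / np.log(np.cos(np.pi / (4 * (nbasis - 1)))**2)
size = n_elements * nbasis + (n_elements * (n_elements + 1)) // 2 * nbasis**2

print(Rs, Ts)
print(eta, zeta, size)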
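The new helpers in qml/utils/utils.py are small but used throughout qmlearn, and their behaviour is easiest to see directly. This assumes the wildcard import in qml/utils/__init__.py exposes them as shown.

from qml.utils import get_unique, get_pairs

nuclear_charges = [[6, 1, 1, 1, 1], [8, 1, 1]]
print(get_unique(nuclear_charges))   # unique elements across all molecules, e.g. [1, 6, 8] (order may vary)
print(get_pairs([1, 6, 8]))          # [[1, 1], [1, 6], [1, 8], [6, 6], [6, 8], [8, 8]]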