Source code for clinamen.metamodel.metamodel
# -*- coding: utf-8 -*-
""" Copyright 2020 Marco Arrigoni
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from abc import ABC, abstractmethod
import copy
import numpy as np
from scipy.linalg import lstsq
from mpi4py import MPI
import torch
import gpytorch
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from clinamen.utils import get_structure_id
from clinamen.evpd import ROOT_LOGGER as logger
from .gp_models import ExactGPModel
comm = MPI.COMM_WORLD
proc_no = comm.Get_rank()
[docs]class GPMetaModel(ABC):
""" Class for creating a Gaussian Process metamodel.
"""
def __init__(self, descriptors_database,
mean_function=None, mean_function_kwargs=None,
kernel_function=None, kernel_function_kwargs=None,
likelihood_function=None, likelihood_function_kwargs=None,
optimizer=None, optimizer_kwargs=None,
marginal_likelihood_function=None,
marginal_likelihood_function_kwargs=None,
preprocessing_pipeline=None, std_value=1e-2):
"""
Parameters
----------
descriptors_database : object
Instance of ``DescriptorsDatabase`` class used to produce
some descriptors.
mean_function : ``gpytorch.means`` class
the class that once initialized produces the mean function.
For example: ``mean_function=gpytorch.means.LinearMean``
mean_function_kwargs : dict
the parameters to instantiate the ``mean_function`` class.
For example: ``mean_function_kwargs={'bias': False}``.
The actual mean function to be used in the GP model will
then be the object ``mean_function(**mean_function_kwargs)``
kernel_function : ``gpytorch.kernels`` class
the class that once initialized produces the kernel function
kernel_function_kwargs : dict
the parameters to instantiate the ``kernel_function`` class
likelihood_function : ``gpytorch.likelihoods`` class
the class that once initialized produces the likelihood function
likelihood_function_kwargs : dict
the parameters to instantiate the ``likelihood_function`` class
optimizer : ``torch.optim`` class
the class that once initialized produces the optimizer for
maximizing the marginal likelihood
optimizer_kwargs : dict
the parameters for initializing the optimizer
marginal_likelihood_function : ``gpytorch.mlls`` class
the marginal likelihood class to be maximized
marginal_likelihood_function_kwargs : dict
the parameters for initializing the
``marginal_likelihood_function``
preprocessing_pipeline : ``sklearn.pipeline.Pipeline`` object
A pipeline to preprocess the inputs data. The metamodel
is trained using the inputs outputted by this pipeline.
The trasformation of ``inputs`` must be accomplished by
calling ``preprocessing_pipeline.fit_transform(``inputs``)``
std_value : float. Default 1e-2 eV/atom
when the prediction standard deviation is smaller than
``std_value``, the fitness for the individual will be taken
from the surrogate model. Otherwise, they will be calculated
"""
self.descriptors_database = descriptors_database
self._mean_class = mean_function
self._mean_kwargs = mean_function_kwargs
self._kernel_class = kernel_function
self._kernel_kwargs = kernel_function_kwargs
self._likelihood_class = likelihood_function
self._likelihood_kwargs = likelihood_function_kwargs
self._optimizer_class = optimizer
self._optimizer_kwargs = optimizer_kwargs
self._mll_class = marginal_likelihood_function
self._mll_kwargs = marginal_likelihood_function_kwargs
self.preprocessing_pipeline = preprocessing_pipeline
self.std_value = std_value
self.train = True # If True, we are in training mode
self._loaded_state = False
[docs] @abstractmethod
def initialize_model(self, X_train, y_train):
""" This function initializes the GP model (``gpytorch.models``)
class, which means it initializes the mean function, kernel function,
and the likelihood. The function also initializes the optimizer
and the marginal log likelihood.
All these initialized objects must be assigned
to the respectie attributes:
- ``self._mean_function``
- ``self._kernel_function``
- ``self._likelihood``
- ``self._model``
- ``self._optimizer``
- ``self._mll``
which can then be accessed through the corresponding property
"""
@property
def mean_function(self):
return self._mean_function
@property
def kernel_function(self):
return self._kernel_function
@property
def likelihood(self):
return self._likelihood
@property
def model(self):
return self._model
@property
def optimizer(self):
return self._optimizer
@property
def mll(self):
return self._mll
@property
def loaded_state(self):
""" If True, it means that the state of the model has been
loaded from an external file
"""
return self._loaded_state
def _write_and_retrieve_descriptors(self, structures):
### get the descriptors
ids = [self.descriptors_database.get_structure_id(atom)
for atom in structures]
# we write and read from disk every time.
self.descriptors_database.write_descriptors(structures)
descriptors, _ = self.descriptors_database.read_descriptors(ids)
if proc_no == 0:
logger.info(
f'Obtained descriptors of shape {descriptors.shape}')
### pass the descriptors through the preprocessing pipeline
if self.preprocessing_pipeline is None:
X = descriptors
else:
if self.train:
X = self.preprocessing_pipeline.fit_transform(descriptors)
else:
X = self.preprocessing_pipeline.transform(descriptors)
if proc_no == 0:
logger.info(
f'Transformed descriptors of shape {X.shape}')
return X
[docs] def fit(self, structures, y, epochs=10000, stopping=1e-3,
stopping_epochs=10, verbose=False):
""" Train the meta-model
Parameters
----------
population : Iterable of structures of length n_samples.
y : ``np.ndarray`` of shape (n_samples, )
the total energy of the structures in ``structures``
epochs : int
the number of epochs for training the metamodel
stopping : float
the loss function minimum change to trigger early stopping
stopping_epochs : float
for how many epochs the loss function should change by less than
``stopping`` in order to enforce early stopping
verbose : bool
If True, prints the loss function every 100 epochs
"""
if proc_no == 0:
logger.info(
f'Training the metamodel with {len(structures)} structures')
self.train = True
X_train = self._write_and_retrieve_descriptors(structures)
# to torch tensors
X_train = torch.as_tensor(X_train, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)
### Initialize all parts of the GP model
self.initialize_model(X_train, y)
### train the metamodel
self.model.train()
self.likelihood.train()
prev_loss_val = np.inf
iter_stop = 0
for i in range(epochs):
self.optimizer.zero_grad()
output = self.model(X_train)
loss = -self.mll(output, y)
loss.backward()
loss_val = loss.detach().numpy()
if i == 0:
if proc_no == 0:
logger.info(f'Initial loss: {loss_val:.3f}')
if verbose and i%100 == 0:
if proc_no == 0:
logger.debug(f'Iter {i+1}/{epochs}. '
f'Loss = {loss_val:.3f}')
self.optimizer.step()
if np.abs(prev_loss_val - loss_val) < stopping:
iter_stop += 1
if iter_stop >= stopping_epochs:
if proc_no == 0:
logger.debug('Early stopping!')
break
else:
iter_stop = 0
prev_loss_val = loss_val
if proc_no == 0:
logger.info(f'Model trained after {i+1} epochs. '
f'Final loss: {loss_val:.3f}')
[docs] def predict(self, structures):
""" Predict the total energy for each individuals in
an iterable of structures.
Parameters
----------
population : Iterable of structures of length n_samples.
Returns:
--------
mean : np.array
the predicted energies
std : np.array
the predicted standard deviations
"""
if proc_no == 0:
logger.info(
f'Making predictions with the metamodel for {len(structures)} '
'structures')
self.train = False
X_test = self._write_and_retrieve_descriptors(structures)
# to torch tensors
X_test = torch.as_tensor(X_test, dtype=torch.float32)
### making predictions
self.model.eval()
self.likelihood.eval()
with torch.no_grad(), gpytorch.settings.fast_pred_var():
prediction_dist = self.likelihood(self.model(X_test))
mean = prediction_dist.mean.detach().numpy()
std = prediction_dist.stddev.detach().numpy()
return mean, std
[docs] def write_descriptors(self, structures):
""" Save the descriptors into the database hdf5 file.
"""
self.descriptors_database.write_descriptors(structures)
[docs] def read_descriptors(self):
""" Read the descriptors from the hdf5 database file.
"""
descriptors, _ = self.descriptors_database.read_descriptors()
return descriptors
[docs] def save_model(self, filename='model_state.pth'):
""" Save the GP model """
torch.save(self.model.state_dict(), filename)
def load_model_state(self, model_pth):
self.state_dict = torch.load(model_pth)
self._loaded_state = True
[docs]class ExactGPMetaModel(GPMetaModel):
""" Class for making a meta-model based on a Gaussian Process
Regressor with exact inference.
"""
[docs] def initialize_model(self, X_train, y_train):
self._mean_function = self._mean_class(**self._mean_kwargs)
self._kernel_function = self._kernel_class(
ard_num_dims=X_train.shape[-1], **self._kernel_kwargs)
self._likelihood = self._likelihood_class(**self._likelihood_kwargs)
self._model = ExactGPModel(X_train, y_train, self._mean_function,
self._kernel_function, self._likelihood,
scale_kernel_kwargs=dict())
self._optimizer = self._optimizer_class(self._model.parameters(),
**self._optimizer_kwargs)
self._mll = self._mll_class(self._likelihood, self._model,
**self._mll_kwargs)
[docs]class BaseExactGPMetaModel(ExactGPMetaModel):
""" Basic Exact GP regressor with a RBF kernel for minimal initialization
effort.
"""
def __init__(self, descriptors_database,
preprocessing_pipeline=None, std_value=1e-2):
"""
Parameters
----------
descriptors_database : object
Instance of ``DescriptorsDatabase`` class used to produce
some descriptors.
preprocessing_pipeline : ``sklearn.pipeline.Pipeline`` object
A pipeline to preprocess the inputs data. The metamodel
is trained using the inputs outputted by this pipeline.
The trasformation of ``inputs`` must be accomplished by
calling ``preprocessing_pipeline.fit_transform(``inputs``)``
std_value : float. Default 1e-2 eV/atom
when the prediction standard deviation is smaller than
``std_value``, the fitness for the individual will be taken
from the surrogate model. Otherwise, they will be calculated
"""
mean_func = gpytorch.means.ConstantMean
kernel_func = gpytorch.kernels.RBFKernel
likelihood = gpytorch.likelihoods.GaussianLikelihood
noise_constraint = gpytorch.constraints.Interval(1e-5, 1e-1)
likelihood_kwargs = dict(noise_covar=1e-3, noise_constraint=noise_constraint)
optimizer = torch.optim.Adam
optimizer_kwargs = dict(lr=1e-2, amsgrad=True)
marginal_log_likelihood = gpytorch.mlls.ExactMarginalLogLikelihood
super().__init__(descriptors_database, mean_func, dict(),
kernel_func, dict(), likelihood, likelihood_kwargs,
optimizer, optimizer_kwargs,
marginal_log_likelihood, dict(),
preprocessing_pipeline, std_value)
[docs]class BasePCAExactGPMetaModel(BaseExactGPMetaModel):
""" Basic Exact GP regressor with a RBF kernel for minimal initialization
effort. The inputs are automatically passed through a pipeline that scales
them and then performs PCA
"""
def __init__(self, descriptors_database,
scaler_kwargs, pca_kwargs,
std_value=1e-2):
"""
Parameters
----------
descriptors_database : object
Instance of ``DescriptorsDatabase`` class used to produce
some descriptors.
scaler_kwargs : dict
parameters to initialize a ``sklean`` ``StandardScaler`` object
pca_kwargs : dict
parameters to initialize a ``sklearn`` ``PCA`` object
std_value : float. Default 1e-2 eV/atom
when the prediction standard deviation is smaller than
``std_value``, the fitness for the individual will be taken
from the surrogate model. Otherwise, they will be calculated
"""
scaler = StandardScaler(**scaler_kwargs)
pca = PCA(**pca_kwargs)
steps = [
('scaler', scaler),
('pca', pca)
]
pipeline = Pipeline(steps)
super().__init__(descriptors_database, pipeline, std_value)