Source code for clinamen.metamodel.metamodel

# -*- coding: utf-8 -*-
""" Copyright 2020 Marco Arrigoni

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from abc import ABC, abstractmethod
import copy
import numpy as np
from scipy.linalg import lstsq
from mpi4py import MPI

import torch
import gpytorch

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from clinamen.utils import get_structure_id
from clinamen.evpd import ROOT_LOGGER as logger
from .gp_models import ExactGPModel


comm = MPI.COMM_WORLD
proc_no = comm.Get_rank()


[docs]class GPMetaModel(ABC): """ Class for creating a Gaussian Process metamodel. """ def __init__(self, descriptors_database, mean_function=None, mean_function_kwargs=None, kernel_function=None, kernel_function_kwargs=None, likelihood_function=None, likelihood_function_kwargs=None, optimizer=None, optimizer_kwargs=None, marginal_likelihood_function=None, marginal_likelihood_function_kwargs=None, preprocessing_pipeline=None, std_value=1e-2): """ Parameters ---------- descriptors_database : object Instance of ``DescriptorsDatabase`` class used to produce some descriptors. mean_function : ``gpytorch.means`` class the class that once initialized produces the mean function. For example: ``mean_function=gpytorch.means.LinearMean`` mean_function_kwargs : dict the parameters to instantiate the ``mean_function`` class. For example: ``mean_function_kwargs={'bias': False}``. The actual mean function to be used in the GP model will then be the object ``mean_function(**mean_function_kwargs)`` kernel_function : ``gpytorch.kernels`` class the class that once initialized produces the kernel function kernel_function_kwargs : dict the parameters to instantiate the ``kernel_function`` class likelihood_function : ``gpytorch.likelihoods`` class the class that once initialized produces the likelihood function likelihood_function_kwargs : dict the parameters to instantiate the ``likelihood_function`` class optimizer : ``torch.optim`` class the class that once initialized produces the optimizer for maximizing the marginal likelihood optimizer_kwargs : dict the parameters for initializing the optimizer marginal_likelihood_function : ``gpytorch.mlls`` class the marginal likelihood class to be maximized marginal_likelihood_function_kwargs : dict the parameters for initializing the ``marginal_likelihood_function`` preprocessing_pipeline : ``sklearn.pipeline.Pipeline`` object A pipeline to preprocess the inputs data. The metamodel is trained using the inputs outputted by this pipeline. The trasformation of ``inputs`` must be accomplished by calling ``preprocessing_pipeline.fit_transform(``inputs``)`` std_value : float. Default 1e-2 eV/atom when the prediction standard deviation is smaller than ``std_value``, the fitness for the individual will be taken from the surrogate model. Otherwise, they will be calculated """ self.descriptors_database = descriptors_database self._mean_class = mean_function self._mean_kwargs = mean_function_kwargs self._kernel_class = kernel_function self._kernel_kwargs = kernel_function_kwargs self._likelihood_class = likelihood_function self._likelihood_kwargs = likelihood_function_kwargs self._optimizer_class = optimizer self._optimizer_kwargs = optimizer_kwargs self._mll_class = marginal_likelihood_function self._mll_kwargs = marginal_likelihood_function_kwargs self.preprocessing_pipeline = preprocessing_pipeline self.std_value = std_value self.train = True # If True, we are in training mode self._loaded_state = False
[docs] @abstractmethod def initialize_model(self, X_train, y_train): """ This function initializes the GP model (``gpytorch.models``) class, which means it initializes the mean function, kernel function, and the likelihood. The function also initializes the optimizer and the marginal log likelihood. All these initialized objects must be assigned to the respectie attributes: - ``self._mean_function`` - ``self._kernel_function`` - ``self._likelihood`` - ``self._model`` - ``self._optimizer`` - ``self._mll`` which can then be accessed through the corresponding property """
@property def mean_function(self): return self._mean_function @property def kernel_function(self): return self._kernel_function @property def likelihood(self): return self._likelihood @property def model(self): return self._model @property def optimizer(self): return self._optimizer @property def mll(self): return self._mll @property def loaded_state(self): """ If True, it means that the state of the model has been loaded from an external file """ return self._loaded_state def _write_and_retrieve_descriptors(self, structures): ### get the descriptors ids = [self.descriptors_database.get_structure_id(atom) for atom in structures] # we write and read from disk every time. self.descriptors_database.write_descriptors(structures) descriptors, _ = self.descriptors_database.read_descriptors(ids) if proc_no == 0: logger.info( f'Obtained descriptors of shape {descriptors.shape}') ### pass the descriptors through the preprocessing pipeline if self.preprocessing_pipeline is None: X = descriptors else: if self.train: X = self.preprocessing_pipeline.fit_transform(descriptors) else: X = self.preprocessing_pipeline.transform(descriptors) if proc_no == 0: logger.info( f'Transformed descriptors of shape {X.shape}') return X
[docs] def fit(self, structures, y, epochs=10000, stopping=1e-3, stopping_epochs=10, verbose=False): """ Train the meta-model Parameters ---------- population : Iterable of structures of length n_samples. y : ``np.ndarray`` of shape (n_samples, ) the total energy of the structures in ``structures`` epochs : int the number of epochs for training the metamodel stopping : float the loss function minimum change to trigger early stopping stopping_epochs : float for how many epochs the loss function should change by less than ``stopping`` in order to enforce early stopping verbose : bool If True, prints the loss function every 100 epochs """ if proc_no == 0: logger.info( f'Training the metamodel with {len(structures)} structures') self.train = True X_train = self._write_and_retrieve_descriptors(structures) # to torch tensors X_train = torch.as_tensor(X_train, dtype=torch.float32) y = torch.as_tensor(y, dtype=torch.float32) ### Initialize all parts of the GP model self.initialize_model(X_train, y) ### train the metamodel self.model.train() self.likelihood.train() prev_loss_val = np.inf iter_stop = 0 for i in range(epochs): self.optimizer.zero_grad() output = self.model(X_train) loss = -self.mll(output, y) loss.backward() loss_val = loss.detach().numpy() if i == 0: if proc_no == 0: logger.info(f'Initial loss: {loss_val:.3f}') if verbose and i%100 == 0: if proc_no == 0: logger.debug(f'Iter {i+1}/{epochs}. ' f'Loss = {loss_val:.3f}') self.optimizer.step() if np.abs(prev_loss_val - loss_val) < stopping: iter_stop += 1 if iter_stop >= stopping_epochs: if proc_no == 0: logger.debug('Early stopping!') break else: iter_stop = 0 prev_loss_val = loss_val if proc_no == 0: logger.info(f'Model trained after {i+1} epochs. ' f'Final loss: {loss_val:.3f}')
[docs] def predict(self, structures): """ Predict the total energy for each individuals in an iterable of structures. Parameters ---------- population : Iterable of structures of length n_samples. Returns: -------- mean : np.array the predicted energies std : np.array the predicted standard deviations """ if proc_no == 0: logger.info( f'Making predictions with the metamodel for {len(structures)} ' 'structures') self.train = False X_test = self._write_and_retrieve_descriptors(structures) # to torch tensors X_test = torch.as_tensor(X_test, dtype=torch.float32) ### making predictions self.model.eval() self.likelihood.eval() with torch.no_grad(), gpytorch.settings.fast_pred_var(): prediction_dist = self.likelihood(self.model(X_test)) mean = prediction_dist.mean.detach().numpy() std = prediction_dist.stddev.detach().numpy() return mean, std
[docs] def write_descriptors(self, structures): """ Save the descriptors into the database hdf5 file. """ self.descriptors_database.write_descriptors(structures)
[docs] def read_descriptors(self): """ Read the descriptors from the hdf5 database file. """ descriptors, _ = self.descriptors_database.read_descriptors() return descriptors
[docs] def save_model(self, filename='model_state.pth'): """ Save the GP model """ torch.save(self.model.state_dict(), filename)
def load_model_state(self, model_pth): self.state_dict = torch.load(model_pth) self._loaded_state = True
[docs]class ExactGPMetaModel(GPMetaModel): """ Class for making a meta-model based on a Gaussian Process Regressor with exact inference. """
[docs] def initialize_model(self, X_train, y_train): self._mean_function = self._mean_class(**self._mean_kwargs) self._kernel_function = self._kernel_class( ard_num_dims=X_train.shape[-1], **self._kernel_kwargs) self._likelihood = self._likelihood_class(**self._likelihood_kwargs) self._model = ExactGPModel(X_train, y_train, self._mean_function, self._kernel_function, self._likelihood, scale_kernel_kwargs=dict()) self._optimizer = self._optimizer_class(self._model.parameters(), **self._optimizer_kwargs) self._mll = self._mll_class(self._likelihood, self._model, **self._mll_kwargs)
[docs]class BaseExactGPMetaModel(ExactGPMetaModel): """ Basic Exact GP regressor with a RBF kernel for minimal initialization effort. """ def __init__(self, descriptors_database, preprocessing_pipeline=None, std_value=1e-2): """ Parameters ---------- descriptors_database : object Instance of ``DescriptorsDatabase`` class used to produce some descriptors. preprocessing_pipeline : ``sklearn.pipeline.Pipeline`` object A pipeline to preprocess the inputs data. The metamodel is trained using the inputs outputted by this pipeline. The trasformation of ``inputs`` must be accomplished by calling ``preprocessing_pipeline.fit_transform(``inputs``)`` std_value : float. Default 1e-2 eV/atom when the prediction standard deviation is smaller than ``std_value``, the fitness for the individual will be taken from the surrogate model. Otherwise, they will be calculated """ mean_func = gpytorch.means.ConstantMean kernel_func = gpytorch.kernels.RBFKernel likelihood = gpytorch.likelihoods.GaussianLikelihood noise_constraint = gpytorch.constraints.Interval(1e-5, 1e-1) likelihood_kwargs = dict(noise_covar=1e-3, noise_constraint=noise_constraint) optimizer = torch.optim.Adam optimizer_kwargs = dict(lr=1e-2, amsgrad=True) marginal_log_likelihood = gpytorch.mlls.ExactMarginalLogLikelihood super().__init__(descriptors_database, mean_func, dict(), kernel_func, dict(), likelihood, likelihood_kwargs, optimizer, optimizer_kwargs, marginal_log_likelihood, dict(), preprocessing_pipeline, std_value)
[docs]class BasePCAExactGPMetaModel(BaseExactGPMetaModel): """ Basic Exact GP regressor with a RBF kernel for minimal initialization effort. The inputs are automatically passed through a pipeline that scales them and then performs PCA """ def __init__(self, descriptors_database, scaler_kwargs, pca_kwargs, std_value=1e-2): """ Parameters ---------- descriptors_database : object Instance of ``DescriptorsDatabase`` class used to produce some descriptors. scaler_kwargs : dict parameters to initialize a ``sklean`` ``StandardScaler`` object pca_kwargs : dict parameters to initialize a ``sklearn`` ``PCA`` object std_value : float. Default 1e-2 eV/atom when the prediction standard deviation is smaller than ``std_value``, the fitness for the individual will be taken from the surrogate model. Otherwise, they will be calculated """ scaler = StandardScaler(**scaler_kwargs) pca = PCA(**pca_kwargs) steps = [ ('scaler', scaler), ('pca', pca) ] pipeline = Pipeline(steps) super().__init__(descriptors_database, pipeline, std_value)