Source code for clinamen.descriptors.utils

# -*- coding: utf-8 -*-
""" Copyright 2020 Marco Arrigoni

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import numpy as np

import h5py

from clinamen import DESCRIPTORS_FLOAT_DTYPE as DTYPE


def write_descriptors(file_name, descriptors, descriptors_grads, ids,
                      name=None, flattened=True):
    """ Append new data to an existing dataset; if the dataset
    does not exist, create a new one.

    Each structure is stored as its own HDF5 dataset, keyed by its
    identifier, inside the group ``X`` (descriptors) and, when Jacobians
    are given, inside the group ``DX``. When appending, identifiers that
    are already present in a group are left untouched.

    Parameters
    ----------
    file_name : string
        the path of the HDF5 file holding the dataset

    descriptors : 2D array-like of shape (n, d), if :attr:`flattened` is ``True``.
        n is the number of structures for which the descriptors
        were calculated. d is the dimensionality of the descriptors.
        If :attr:`flattened` is ``False``, descriptors can be a multidimensional
        array of shape (n, ...).

    descriptors_grads : 2D array-like of shape (n, r) if :attr:`flattened` is ``True``.
        Otherwise, it can be a multidimensional array of shape (n, ...).
        It can also be ``None``. If not None, these are the (possibly flattened,
        if :attr:`flattened` is ``True``) Jacobians of the descriptors.

    ids : array-like of shape (n, )
        for each descriptor, a string that identifies the structure
        corresponding to that descriptor

    name : string. Default None
        the system name. A tag that specifies the system when the
        dataset is created (``'system'`` is used if None). If the dataset
        already exists and ``name`` is not None, it is checked that it
        corresponds to the stored system name.

    flattened : bool. Default True
        if ``True``, the per-structure shape is taken to be ``(shape[1], )``;
        otherwise it is everything after the first axis (``shape[1:]``).

    Raises
    ------
    ValueError
        if the number of descriptors, Jacobians and identifiers disagree,
        or if ``name`` or the per-structure shapes are inconsistent with
        the ones recorded in an already existing file.
    """
    comp_kwargs = {'compression': 'gzip', 'compression_opts': 9}
    X = np.atleast_2d(descriptors)
    shape_x = (X.shape[1], ) if flattened else X.shape[1:]
    n_structs = X.shape[0]
    if n_structs != len(ids):
        raise ValueError(f'Have got descriptors for {n_structs} structures '
                         f'but {len(ids)} identifiers are given')

    # An empty tuple marks "no Jacobians" both here and in the file attributes
    shape_dx = tuple()
    DX = None
    if descriptors_grads is not None:
        DX = np.atleast_2d(descriptors_grads)
        shape_dx = (DX.shape[1], ) if flattened else DX.shape[1:]
        if X.shape[0] != DX.shape[0]:
            raise ValueError(f'Have got descriptors for {X.shape[0]} '
                             f'structures but gradients for {DX.shape[0]}')

    new_dataset = not os.path.isfile(file_name)
    if not new_dataset:
        # get data to check dataset consistency
        with h5py.File(file_name, 'r') as f:
            sys_attrs = f['system'].attrs
            sys_name = sys_attrs['name']
            existing_X_shape = tuple(int(v) for v in sys_attrs['X_shape'])
            existing_DX_shape = tuple(int(v) for v in sys_attrs['dX_shape'])
        if isinstance(sys_name, bytes):
            sys_name = sys_name.decode()
        if name is not None and name != sys_name:
            raise ValueError(f'Dataset name tag {name} does not '
                             f'agree with the name tag {sys_name} '
                             f'of dataset {file_name}')
        if existing_X_shape != shape_x:
            raise ValueError(f'Descriptors have shape {shape_x}, but '
                             f'existing descriptors with shape '
                             f'{existing_X_shape} have been found in '
                             f'{file_name}')
        # Two different shapes can never both be empty, so a plain
        # inequality covers every mismatch (including Jacobians being
        # present on one side only).
        if existing_DX_shape != shape_dx:
            raise ValueError(f'Descriptors Jacobians have shape {shape_dx}, '
                             f'but existing Jacobians with shape '
                             f'{existing_DX_shape} have been found in '
                             f'{file_name}')

    with h5py.File(file_name, 'w' if new_dataset else 'a') as f:
        if new_dataset:
            system = f.create_group('system')
            system.attrs['name'] = np.bytes_(
                'system' if name is None else name)
            # uint64 so that large (e.g. flattened Jacobian) dimensions fit
            system.attrs['X_shape'] = np.array(shape_x, dtype=np.uint64)
            system.attrs['dX_shape'] = np.array(shape_dx, dtype=np.uint64)
            dataX = f.create_group('X')
            dataX.attrs['name'] = np.bytes_('descriptors')
        else:
            dataX = f['X']
        _store_datasets(dataX, X, ids, shape_x, comp_kwargs,
                        skip_existing=not new_dataset)
        if DX is not None:
            if new_dataset:
                dataDX = f.create_group('DX')
                dataDX.attrs['name'] = np.bytes_('descriptors_Jacobians')
            else:
                # the shape check above guarantees the group exists
                dataDX = f['DX']
            _store_datasets(dataDX, DX, ids, shape_dx, comp_kwargs,
                            skip_existing=not new_dataset)


def _store_datasets(group, arrays, ids, maxshape, comp_kwargs,
                    skip_existing):
    """ Write one compressed dataset per (array, id) pair into ``group``.

    If ``skip_existing`` is ``True``, identifiers already present in
    ``group`` are skipped instead of being written again.
    """
    for array, struct_id in zip(arrays, ids):
        if skip_existing and struct_id in group:
            continue
        group.create_dataset(struct_id, data=array, dtype=DTYPE,
                             maxshape=maxshape, **comp_kwargs)


def read_descriptors_by_id(file_name, ids):
    """ Given an id or a list thereof, return the descriptors (and the
    Jacobians, if stored) found for them.

    Parameters
    ----------
    file_name : string
        the hdf5 file from where the descriptors should be fetched

    ids : string or iterable of strings
        the identity keys of the descriptors we want to fetch. A single
        string is treated as one identifier.

    Returns
    -------
    X, DX, indices : tuple
        the descriptors and their Jacobians, each as a list of arrays, and
        a list with the positions in ``ids`` of the identifiers that
        were found.
        ``X`` is None if no identifier was found (or the file does not
        exist); ``DX`` is None in the same cases and when the file holds
        no Jacobians.

    Raises
    ------
    KeyError
        if the file stores Jacobians but none is present for an
        identifier whose descriptor was found.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(ids, (str, bytes)):
        ids = [ids]
    if not os.path.isfile(file_name):
        return None, None, []

    indices = []
    X_data = []
    DX_data = []
    with h5py.File(file_name, 'r') as f:
        dataX = f['X']
        dataDX = f['DX'] if 'DX' in f else None
        for i, id_no in enumerate(ids):
            try:
                val = np.array(dataX[id_no], dtype=DTYPE)
            except KeyError:
                # identifier not stored: skip it
                continue
            if dataDX is not None:
                try:
                    grad = np.array(dataDX[id_no], dtype=DTYPE)
                except KeyError as err:
                    raise KeyError(f'Found descriptors for {id_no} in '
                                   f'{file_name} but no Jacobians') from err
                DX_data.append(grad)
            indices.append(i)
            X_data.append(val)
    # Descriptors and Jacobians are appended in lockstep, so the two lists
    # always have the same length when Jacobians are stored.
    return (X_data if X_data else None,
            DX_data if DX_data else None,
            indices)