Source code for clinamen.clustering.stats_tools

# -*- coding: utf-8 -*-
""" Copyright 2020 Marco Arrigoni

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import time
import json
import numpy as np
from scipy.linalg import cho_factor, cho_solve
from scipy.special import erf
import matplotlib.pyplot as plt


[docs]def read_cmaes_status(json_status):
    """ Read and parse a CMAES status

    Parameters
    ----------
    json_status : string
        the path to the json file describing the CMAES status

    Returns
    -------
    data : dict
        the dictionary with the retrieved data
    """
    with open(json_status, 'r') as fp:
        data = json.load(fp)
        for key, value in data['cmaes'].items():
            if key in ['mean', 'covariance', 'p_sigma', 'p_c']:
                data['cmaes'][key] = np.array(value)
    return data


[docs]def calculate_KL_divergence_gaussians(mean_1, mean_2,
                                      covariance_1, covariance_2):
    """ Calculate the Kullback-Leibler divergence between two Gaussians:
    KL(G1 || G2) = E_1[ln G1 - ln G2]

    Parameters
    ----------
    mean_1, mean_2 : 1D np.ndarray
        the mean vectors of the distributions

    covariance_1, covariance_2 : 2D np.ndarray
        the covariance matrices of the distributions

    Returns
    -------
    divergence : float
        the KL divergence
    """
    if not (len(mean_1) == len(mean_2) or
            covariance_1.shape == covariance_2.shape):
        raise ValueError('Shapes must match')
    n = len(mean_1)
    L1, low1 = cho_factor(covariance_1, lower=True)
    L2, low2 = cho_factor(covariance_2, lower=True)
    log1 = 2*np.sum(np.log(np.diag(L1)))
    log2 = 2*np.sum(np.log(np.diag(L2)))
    mean_diff = mean_2 - mean_1
    term1 = cho_solve((L2, low2), covariance_1)
    term2 = cho_solve((L2, low2), mean_diff)
    log_term = log2 - log1
    divergence = (log_term - n +
                  np.trace(term1) + np.dot(mean_diff, term2))
    return 0.5*divergence


[docs]def calculate_B_distance_gaussians(mean_1, mean_2, covariance_1, covariance_2):
    """ Calculates the Bhattacharyya distance between two normal
    distributions.

    Parameters
    ----------
    mean_1, mean_2 : 1D np.ndarray
        the mean vectors of the distributions

    covariance_1, covariance_2 : 2D np.ndarray
        the covariance matrices of the distributions

    Returns
    -------
    distance : float
        the Bhattacharyya distance
    """
    if not (len(mean_1) == len(mean_2) or
            covariance_1.shape == covariance_2.shape):
        raise ValueError('Shapes must match')
    av_covariance = 0.5*(covariance_1 + covariance_2)
    L1, low1 = cho_factor(covariance_1, lower=True)
    L2, low2 = cho_factor(covariance_2, lower=True)
    Lav, lowav = cho_factor(av_covariance, lower=True)
    log1 = 2*np.sum(np.log(np.diag(L1)))
    log2 = 2*np.sum(np.log(np.diag(L2)))
    log_av = 2*np.sum(np.log(np.diag(Lav)))

    mean_diff = mean_1 - mean_2
    term1 = cho_solve((Lav, lowav), mean_diff)
    term1 = np.dot(mean_diff, term1)

    distance = term1/8 + 0.5*(log_av - 0.5*(log1 + log2))

    return distance


[docs]def calculate_B_coefficient_gaussians(mean_1, mean_2,
                                      covariance_1, covariance_2):
    """ Calculates the Bhattacharyya coefficient between two normal
    distributions

    Parameters
    ----------
    mean_1, mean_2 : 1D np.ndarray
        the mean vectors of the distributions

    covariance_1, covariance_2 : 2D np.ndarray
        the covariance matrices of the distributions

    Returns
    -------
    b_coeff : float
        the Bhattacharyya coefficient
    """
    distance = calculate_B_distance_gaussians(mean_1, mean_2,
                                              covariance_1, covariance_2)
    b_coeff = np.exp(-distance)

    return b_coeff


[docs]def calculate_H_distance_gaussians(mean_1, mean_2, covariance_1, covariance_2):

    """ Calculate the Hellinger distance between two gaussians. 

    Parameters
    ----------
    mean_1, mean_2 : 1D np.ndarray
        the mean vectors of the distributions

    covariance_1, covariance_2 : 2D np.ndarray
        the covariance matrices of the distributions

    Returns
    -------
    distance : float
        the Hellinger distance
    """
    b_coeff = calculate_B_coefficient_gaussians(mean_1, mean_2,
                                                covariance_1, covariance_2)
    return np.sqrt(1 - b_coeff)


[docs]def integral_multivariate_standard_normal_rectangular_region(region):
    """ Compute the probability that a normal standard vector assumes values
    in a rectangular region

    Parameters
    ----------
    region : tuple of 2-ple
        region = ((a_1, b_1), (a_2, b_2), ... , (a_k, b_k))
        the number of tuples gives the dimension of the random vector.
        Each 2-ple contains the initial and final integration limits on
        the considered direction

    Returns
    -------
    probability : float
        the corresponding probability
    """
    k = len(region)

    probability = 1

    for reg in region:
        value = erf(reg[1]/np.sqrt(2)) - erf(reg[0]/np.sqrt(2))
        value /= 2
        print(value)
        probability *= value

    return probability