Source code for clinamen.clustering.stats_tools

# -*- coding: utf-8 -*-
""" Copyright 2020 Marco Arrigoni

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import time
import json
import numpy as np
from scipy.linalg import cho_factor, cho_solve
from scipy.special import erf
import matplotlib.pyplot as plt


[docs]def read_cmaes_status(json_status): """ Read and parse a CMAES status Parameters ---------- json_status : string the path to the json file describing the CMAES status Returns ------- data : dict the dictionary with the retrieved data """ with open(json_status, 'r') as fp: data = json.load(fp) for key, value in data['cmaes'].items(): if key in ['mean', 'covariance', 'p_sigma', 'p_c']: data['cmaes'][key] = np.array(value) return data
[docs]def calculate_KL_divergence_gaussians(mean_1, mean_2, covariance_1, covariance_2): """ Calculate the Kullback-Leibler divergence between two Gaussians: KL(G1 || G2) = E_1[ln G1 - ln G2] Parameters ---------- mean_1, mean_2 : 1D np.ndarray the mean vectors of the distributions covariance_1, covariance_2 : 2D np.ndarray the covariance matrices of the distributions Returns ------- divergence : float the KL divergence """ if not (len(mean_1) == len(mean_2) or covariance_1.shape == covariance_2.shape): raise ValueError('Shapes must match') n = len(mean_1) L1, low1 = cho_factor(covariance_1, lower=True) L2, low2 = cho_factor(covariance_2, lower=True) log1 = 2*np.sum(np.log(np.diag(L1))) log2 = 2*np.sum(np.log(np.diag(L2))) mean_diff = mean_2 - mean_1 term1 = cho_solve((L2, low2), covariance_1) term2 = cho_solve((L2, low2), mean_diff) log_term = log2 - log1 divergence = (log_term - n + np.trace(term1) + np.dot(mean_diff, term2)) return 0.5*divergence
[docs]def calculate_B_distance_gaussians(mean_1, mean_2, covariance_1, covariance_2): """ Calculates the Bhattacharyya distance between two normal distributions. Parameters ---------- mean_1, mean_2 : 1D np.ndarray the mean vectors of the distributions covariance_1, covariance_2 : 2D np.ndarray the covariance matrices of the distributions Returns ------- distance : float the Bhattacharyya distance """ if not (len(mean_1) == len(mean_2) or covariance_1.shape == covariance_2.shape): raise ValueError('Shapes must match') av_covariance = 0.5*(covariance_1 + covariance_2) L1, low1 = cho_factor(covariance_1, lower=True) L2, low2 = cho_factor(covariance_2, lower=True) Lav, lowav = cho_factor(av_covariance, lower=True) log1 = 2*np.sum(np.log(np.diag(L1))) log2 = 2*np.sum(np.log(np.diag(L2))) log_av = 2*np.sum(np.log(np.diag(Lav))) mean_diff = mean_1 - mean_2 term1 = cho_solve((Lav, lowav), mean_diff) term1 = np.dot(mean_diff, term1) distance = term1/8 + 0.5*(log_av - 0.5*(log1 + log2)) return distance
[docs]def calculate_B_coefficient_gaussians(mean_1, mean_2, covariance_1, covariance_2): """ Calculates the Bhattacharyya coefficient between two normal distributions Parameters ---------- mean_1, mean_2 : 1D np.ndarray the mean vectors of the distributions covariance_1, covariance_2 : 2D np.ndarray the covariance matrices of the distributions Returns ------- b_coeff : float the Bhattacharyya coefficient """ distance = calculate_B_distance_gaussians(mean_1, mean_2, covariance_1, covariance_2) b_coeff = np.exp(-distance) return b_coeff
[docs]def calculate_H_distance_gaussians(mean_1, mean_2, covariance_1, covariance_2): """ Calculate the Hellinger distance between two gaussians. Parameters ---------- mean_1, mean_2 : 1D np.ndarray the mean vectors of the distributions covariance_1, covariance_2 : 2D np.ndarray the covariance matrices of the distributions Returns ------- distance : float the Hellinger distance """ b_coeff = calculate_B_coefficient_gaussians(mean_1, mean_2, covariance_1, covariance_2) return np.sqrt(1 - b_coeff)
[docs]def integral_multivariate_standard_normal_rectangular_region(region): """ Compute the probability that a normal standard vector assumes values in a rectangular region Parameters ---------- region : tuple of 2-ple region = ((a_1, b_1), (a_2, b_2), ... , (a_k, b_k)) the number of tuples gives the dimension of the random vector. Each 2-ple contains the initial and final integration limits on the considered direction Returns ------- probability : float the corresponding probability """ k = len(region) probability = 1 for reg in region: value = erf(reg[1]/np.sqrt(2)) - erf(reg[0]/np.sqrt(2)) value /= 2 print(value) probability *= value return probability