Source code for tropygal.base_funcs

import numpy as np
from scipy.special import gamma as Gamma
from scipy.special import psi # digamma function
from scipy.spatial import KDTree
from scipy import integrate

#pi2 = np.pi**2.
#-----------------
[docs] def V_d(dim): """ Volume of (unit-radius) hyper-sphere of dimension d Parameters ---------- dim: int dimension Returns ------- float number the volume [pi^(dim/2.)]/Gamma(dim/2. + 1) """ # Particular cases (for performance): if (dim==1): return 2. if (dim==2): return np.pi if (dim==3): # return 4.*np.pi/3. return 4.18879020478639 if (dim==4): # return pi2/2. return 4.93480220054468 if (dim==5): # return 8.*pi2/15. return 5.26378901391432 if (dim==6): # return pi2*np.pi/6. return 5.16771278004997 # From here, V_d starts decreasing with d (the curse of dimensionality) return np.pi**(dim/2.)/Gamma(dim/2. + 1)
#----------------------------- # Leandro - Aug 12, 2025 - begin:
[docs] def density(data, data_base=None, k=1, correct_bias=False, vol_correction='cube', l_cube_over_d=None, workers=-1): """ Estimate the density (pdf) for best use in NN entropy estimate. f_i = 1/[ (M-1) * exp(-psi(k)) * V_d * D^d ], where: M is size of sample building the tree psi is the digamma function V_d is the volume of unitary hypersphere in d-dimensions D is the Euclidean distance to kth neighbor Parameters ---------- data: array [N, dim] Data points to be queried data_base: array [M, dim] Data points of sample that will serve to build the tree, while previous data array is for the query points. If data_base is not provided, it's assumed that data_base = data and M = N k: in number kth nearest neighbor correct_bias: Boolean If correct for bias due to boundary effects as proposed by Charzynska & Gambin 2015 vol_correction: string strategy for correction: - 'cube': the support is assumed to be a paralelepiped, and the volume around each point is a cube l_cube_over_d: float Side of cube around each point divided by D, the distance to the k-neighbor. A typically good choice is a cube inscribed in the sphere, i.e. side/D = 2/sqrt(dim). If None, this is set by default workers: int (default: -1, meaning using all CPUs available). Number of CPUs to be used to parallelize the seacrh for NNs. Returns ------- float density estimate f_i = 1/[ (M-1) * exp(-psi(k)) * V_d * D^d ] References ---------- .. [1] N. Leonenko, L. Pronzato, V. Savani, "A class of Rényi information estimators for multidimensional densities.", Ann. Statist. 36 (5) 2153 - 2182, 2008 """ if (data_base is not None): if (len(np.shape(data)) != len(np.shape(data_base))): raise ValueError("tropygal: Data arrays should be of form [N, dim] and [M, dim], or 1D arrays") if (len(np.shape(data)) == 1): dim = 1 data = np.reshape(data, (len(data), 1)) if (data_base is not None): data_base = np.reshape(data_base, (len(data_base), 1)) elif (len(np.shape(data)) == 2): if (data_base is not None) and (np.shape(data)[1] != np.shape(data_base)[1]): raise ValueError("tropygal: Data arrays should be of form [N, dim] and [M, dim], or 1D arrays") dim = np.shape(data)[1] else: raise ValueError("tropygal: Data array(s) should be of form [N, dim], or 1D array(s)") if not isinstance(k, (int, np.integer)): raise TypeError(f"tropygal: k must be an integer, got {type(dim).__name__}") N = np.shape(data)[0] if (data_base is not None): M = np.shape(data_base)[0] tree = KDTree(data_base, leafsize=10) # default leafsize=10 else: M = N tree = KDTree(data, leafsize=10) # default leafsize=10 dist, ind = tree.query(data, k=k+1, workers=workers) # workers is number of threads. -1 means all threads; k+1 because the first is the particle itself # Improve correction for points with D_NN = 0 (critical for bootstraps, where several points are repeated in the data). # In the new version, if D_NN=0, we consider these as same points and just consider them to be copies of the same point, with D_NN to the next neighbor with D_NN >0 dist_kNN = dist[:,k].copy() idx_to_fix = np.where(dist_kNN == 0)[0] if len(idx_to_fix) > 0: # Number of points with zero distance (typically, if not zero, very small compared to N, unless using bootstrap samples) #print ('tropygal:', len(idx_to_fix),' points with zero D_NN. Taking next neighbors until D_NN>0...') # For points with D_NN=0, we look for the next neighbor, but there may be points repeated several times. # In bootstraps, the probability por a point to be picked k times is ~ e^{-1}/k!. For k=10 (k=15), this is ~ 10^{-7} (~ 10^{-13}). So, for samples w/ N<10^10, k+15 should be enough: max_k = min(k+15, N) # Query up to k+15 neighbors, but not all points dists_fixed, _ = tree.query(data[idx_to_fix], k=max_k+1, workers=workers) # Find the first nonzero distance for each affected point for i, dist_row in zip(idx_to_fix, dists_fixed): nonzero_NN = dist_row[dist_row > 0] # Ignore self-matches (zero distances) if len(nonzero_NN) > 0: dist_kNN[i] = nonzero_NN[k-1] # Take k-th first nonzero distance else: print ('tropygal: point', i, 'with no neighbor with D_NN>0 found.') f = np.exp(psi(k))/( (M-1)*V_d(dim)*dist_kNN**dim ) if (correct_bias==True): # The fraction of the volume around particle i inside the domain: log_frac_vol = np.zeros(N) if (vol_correction=='cube'): if l_cube_over_d is None: l_cube_over_d = 2/np.sqrt(dim) l_cube = l_cube_over_d * dist_kNN for j in range(dim): if (data_base is not None): xmax = max(data_base[:, j]) xmin = min(data_base[:, j]) else: xmax = max(data[:, j]) xmin = min(data[:, j]) y = np.minimum(np.maximum(data[:, j], xmin), xmax) dx_max_over_l_cube = (xmax - y) / l_cube # we only need to correct if this is < 1/2, i.e. if the volume of the cube goes beyond support dx_min_over_l_cube = (y - xmin) / l_cube # we only need to correct if this is < 1/2 needs_correc = dx_max_over_l_cube < 0.5 log_frac_vol[needs_correc] = log_frac_vol[needs_correc] + np.log(0.5 + dx_max_over_l_cube[needs_correc]) needs_correc = dx_min_over_l_cube < 0.5 log_frac_vol[needs_correc] = log_frac_vol[needs_correc] + np.log(0.5 + dx_min_over_l_cube[needs_correc]) correction = np.exp(log_frac_vol) else: # raise ValueError("tropygal: vol_correction is the volume around each point assumed for the bias correction. Current possible values: 'cube', 'actions' or 'sph'") raise ValueError("tropygal: vol_correction is the volume around each point assumed for the bias correction. Current possible values: 'cube'.") return f/correction else: return f
#-----------------------------
[docs] def entropy(data, mu=1, k=1, correct_bias=False, vol_correction='cube', l_cube_over_d=None, workers=-1): """ Estimate of (Shannon/Jaynes) differential entropy S = - int f ln (f/mu) d^dim x. The factor mu ensures that f/mu is dimensionless. It also stores the density of states if the DF is a function of integrals only. For precise estimates, we also want all (x_1, x_2..., x_dim) to be order unit. S is estimated as - (1/N) * sum_i ( ln(f_i) - ln(mu_i), where f_i is the estimate of the DF f around point/particle/star i For NN (Nerest Neighbor) method: From e.g. Eq. (10) in Leonenko, Pronzato, Savani (2008): f_i = 1/[ (N-1) * exp(-psi(k)) * V_d * D^d ], where: N is sample size psi is the digamma function V_d is the volume of unitary hypersphere in d-dimensions D is the Euclidean distance to kth neighbor Parameters ---------- data: array [N, dim] Data points mu: float number or array of size N It ensures the argument of ln() is dimensionless. If x -> = x/sigma_x, y -> y/sigma_y... -> mu = 1/(sigma_x*sigma_y...). It is also the density of states, e.g. mu = g(E), mu = g(E,L) or mu = (2pi)^3, in cases where the DF depends only on integrals, e.g. energy, or energy and angular momentum, or actions, respectively. k: int value kth nearest neighbor correct_bias: Boolean If correct for bias due to boundary effects as proposed by Charzynska & Gambin 2015 vol_correction: string strategy for correction: - 'cube': the support is assumed to be a paralelepiped, and the volume around each point is a cube l_cube_over_d: float side of cube around each point divided by D, the distance to the k-neighbor. A typically good choice is a cube inscribed in the sphere, i.e. side/D = 2/sqrt(dim). If None, this is set by default workers: int (default: -1, meaning using all CPUs available) Number of CPUs to be used to parallelize the seacrh for NNs. Returns ------- float entropy estimate -<ln(f/mu)> = - (1/N)*sum_i ln(f_i/mu_i) References ---------- .. [1] N. Leonenko, L. Pronzato, V. Savani, "A class of Rényi information estimators for multidimensional densities.", Ann. Statist. 36 (5) 2153 - 2182, 2008 """ f = density(data, data_base=None, k=k, correct_bias=correct_bias, vol_correction=vol_correction, l_cube_over_d=l_cube_over_d, workers=workers) return -np.mean(np.log(f)) + np.mean(np.log(mu)) # return S = - <ln (f/mu)>
#-----------------------------
[docs] def cross_entropy(data, data_base, mu=1, k=1, correct_bias=False, vol_correction='cube', l_cube_over_d=None, workers=-1): """ Estimate of the cross entropy H = - int f0 ln (f/mu) d^dim x. The factor mu ensures that f/mu is dimensionless and it also stores the density of states if the DF is a function of integrals only. For precise estimates, we also want all (x_1, x_2..., x_dim) to be order unit. H is estimated as (1/N) * sum_i=1^N ln(f_i), where f_i is the estimate of the DF f based on the dist. of point i in sample 1 to its kth neighbor in sample 2. For NN (Nerest Neighbor) method: From e.g. Eq. (11) in Leonenko, Pronzato, Savani (2008): f_i = 1/[ M * exp(-psi(k)) * V_d * D^d ], where: N is size of sample 1, M is size of sample 2, psi is the digamma function, V_d is the volume of unitary hypersphere in d-dimensions, D is the Euclidean distance of particle i in sample 1 to its kth neighbor in sample 2. Parameters ---------- data: array [N, dim] Data points of sample 1 data_base: array [M, dim] Data points of sample 2 mu: float number or array of size N It ensures the argument of ln() is dimensionless. If x -> = x/sigma_x, y -> y/sigma_y... -> mu = 1/(sigma_x*sigma_y...). It is also the density of states, e.g. mu = g(E), mu = g(E,L) or mu = (2pi)^3, in cases where the DF depends only on integrals, e.g. energy, or energy and angular momentum, or actions, respectively. k: int value kth nearest neighbor correct_bias: Boolean If correct for bias due to boundary effects as proposed by Charzynska & Gambin 2015 vol_correction: string strategy for correction: - 'cube': the support is assumed to be a paralelepiped, and the volume around each point is a cube l_cube_over_d: float Side of cube around each point divided by D, the distance to the k-neighbor. A typically good choice is a cube inscribed in the sphere, i.e. side/D = 2/sqrt(dim). If None, this is set by default workers: int (default: -1, meaning using all CPUs available). Number of CPUs to be used to parallelize the seacrh for NNs. Returns ------- float cross entropy estimate -<ln(f/mu)> = - (1/N)*sum_i=1^N ln(f_i/mu_i) """ f = density(data, data_base=data_base, k=k, correct_bias=correct_bias, vol_correction=vol_correction, l_cube_over_d=l_cube_over_d, workers=workers) return -np.mean(np.log(f)) + np.mean(np.log(mu)) # return S = - <ln (f/mu)>
#-----------------------------
[docs] def C_k(q, k=1): """ For Renyi entropy. Check Eq. (7) in Leonenko, Pronzato, Savani (2008) Parameters ---------- q: float number q index in Rényi entropy (must be != 1) k: int value kth nearest neighbor Returns ------- float C_k """ return (Gamma(k)/Gamma(k+1.-q))**(1./(1.-q))
#-----------------------------
[docs] def renyi_entropy(data, mu=1, q=2, k=1): """ Estimate of Rényi entropy S_q = [1/(1 - q)] ln int f (f/mu)^(q-1) d^dim x, for q != 1. The factor mu ensures that f/mu is dimensionless. S_q is estimated as [1/(1-q)] ln (1/N) * sum_i=1^N (f_i/mu_i)^(q-1), where f_i is the estimate of the DF f around point/particle/star i For NN (Nerest Neighbor) method: From e.g. Eq. (7) in Leonenko, Pronzato, Savani (2008): f_i = 1/[ (N-1) * C_k * V_d * D^dim ], where C_k = [Gamma(k)/Gamma(k+1-q)]^[1/(1-q)] Parameters ---------- data: array [N, dim] Data points mu: float number or array of size N It ensures the argument of ln() is dimensionless. If x' = x/sigma_x, y' = y/sigma_y... -> mu = 1/(sigma_x*sigma_y...). It is also the density of states, e.g. mu = g(E), mu = g(E,L) or mu = (2pi)^3, in cases where the DF depends only on integrals, e.g. energy, or energy and angular momentum, or actions, respectively. q: float value q-parameter of the entropy; needs to be q != 1 k: int value kth nearest neighbor Returns ------- float entropy estimate [1/(1 - q)] ln <(f/mu)^(q-1)> """ if (len(np.shape(data)) == 1): dim = 1 data = np.reshape(data, (len(data), 1)) elif (len(np.shape(data)) == 2): dim = np.shape(data)[1] else: raise ValueError("tropygal: Array with data should be of form [N, dim].") return np.nan if (q==1): raise ValueError ('tropygal: For the Rényi entropy, q needs to be != 1') return np.nan N = np.shape(data)[0] tree = KDTree(data, leafsize=10) # default leafsize=10 dist, ind = tree.query(data, k=k+1, workers=-1) # workers is number of threads. -1 means all threads # Improve correction for points with D_NN = 0 (critical for bootstraps, where several points are repeated in the data). # In the new version, if D_NN=0, we consider these as same points and just consider them to be copies of the same point, with D_NN to the next neighbor with D_NN >0 dist_kNN = dist[:,k].copy() idx_to_fix = np.where(dist_kNN == 0)[0] if len(idx_to_fix) > 0: # Number of points with zero distance (typically, if not zero, very small compared to N, unless using bootstrap samples) #print ('tropygal:', len(idx_to_fix),' points with zero D_NN. Taking next neighbors until D_NN>0...') # For points with D_NN=0, we look for the next neighbor, but there may be points repeated several times. # In bootstraps, the probability por a point to be picked k times is ~ e^{-1}/k!. For k=10, this is ~ 10^{-7}. So, for samples w/ N<10^7, k+15 should be enough: max_k = min(k+15, N) # Query up to k+15 neighbors, but not all points dists_fixed, _ = tree.query(data[idx_to_fix], k=max_k+1, workers=workers) # Find the first nonzero distance for each affected point for i, dist_row in zip(idx_to_fix, dists_fixed): nonzero_NN = dist_row[dist_row > 0] # Ignore self-matches (zero distances) if len(nonzero_NN) > 0: dist_kNN[i] = nonzero_NN[k-1] # Take k-th first nonzero distance else: print ('tropygal: point', i, 'with no neighbor with D_NN>0 found.') #D = dist_kNN[idx] f = 1./( (N-1) * C_k(q, k) * V_d(dim) * dist_kNN**dim ) return (1./(1-q))*np.log(np.mean((f/mu)**(q-1.)))
#-----------------------------
[docs] def tsallis_entropy(data, mu=1, q=2, k=1): """ Estimate of Tsallis entropy S_q = [1/(q - 1)][ 1 - int f (f/mu)^(q-1) d^dim x ], for q != 1. The factor mu ensures that f/mu is dimensionless. S_q is estimated as [1/(q - 1)] [ 1 - (1/N) * sum_i=1^N (f_i/mu_i)^(q-1) ], where f_i is the estimate of the DF f around point/particle/star i For NN (Nerest Neighbor) method: From e.g. Eq. (7) in Leonenko, Pronzato, Savani (2008): f_i = 1/[ (N-1) * C_k * V_d * D^dim ], where C_k = [Gamma(k)/Gamma(k+1-q)]^[1/(1-q)] Parameters ---------- data: array [N, dim] Data points mu: float number or array of size N It ensures the argument of ln() is dimensionless. If x -> = x/sigma_x, y -> y/sigma_y... -> mu = 1/(sigma_x*sigma_y...). It is also the density of states, e.g. mu = g(E), mu = g(E,L) or mu = (2pi)^3, in cases where the DF depends only on integrals, e.g. energy, or energy and angular momentum, or actions, respectively. q: int value q-parameter of the entropy; needs to be q != 1 k: int value kth nearest neighbor Returns ------- float entropy estimate [1/(q - 1)] [ 1 - <f^(q-1)> ] """ if (len(np.shape(data)) == 1): dim = 1 data = np.reshape(data, (len(data), 1)) elif (len(np.shape(data)) == 2): dim = np.shape(data)[1] else: raise ValueError("tropygal: Array with data should be of form [N, dim].") return np.nan if (q==1): raise ValueError ('tropygal: For the Tsallis entropy, q needs to be != 1.') return np.nan N = np.shape(data)[0] tree = KDTree(data, leafsize=10) # default leafsize=10 dist, ind = tree.query(data, k=k+1, workers=-1) # workers is number of threads. -1 means all threads # Improve correction for points with D_NN = 0 (critical for bootstraps, where several points are repeated in the data). # In the new version, if D_NN=0, we consider these as same points and just consider them to be copies of the same point, with D_NN to the next neighbor with D_NN >0 dist_kNN = dist[:,k].copy() idx_to_fix = np.where(dist_kNN == 0)[0] if len(idx_to_fix) > 0: # Number of points with zero distance (typically, if not zero, very small compared to N, unless using bootstrap samples) #print ('tropygal:', len(idx_to_fix),' points with zero D_NN. Taking next neighbors until D_NN>0...') # For points with D_NN=0, we look for the next neighbor, but there may be points repeated several times. # In bootstraps, the probability por a point to be picked k times is ~ e^{-1}/k!. For k=10, this is ~ 10^{-7}. So, for samples w/ N<10^7, k+15 should be enough: max_k = min(k+15, N) # Query up to k+15 neighbors, but not all points dists_fixed, _ = tree.query(data[idx_to_fix], k=max_k+1, workers=workers) # Find the first nonzero distance for each affected point for i, dist_row in zip(idx_to_fix, dists_fixed): nonzero_NN = dist_row[dist_row > 0] # Ignore self-matches (zero distances) if len(nonzero_NN) > 0: dist_kNN[i] = nonzero_NN[k-1] # Take k-th first nonzero distance else: print ('tropygal: point', i, 'with no neighbor with D_NN>0 found.') #D = dist_kNN[idx] f = 1./( (N-1) * C_k(q, k) * V_d(dim) * dist_kNN**dim ) return (1./(q - 1))*( 1. - np.mean((f/mu)**(q-1.)))