Source code for tropygal.base_funcs

import numpy as np
from scipy.special import gamma as Gamma
from scipy.special import psi # digamma function
from scipy.spatial import KDTree
from scipy import integrate

#pi2 = np.pi**2.
#-----------------

[docs]
def V_d(dim):
    """ Volume of (unit-radius) hyper-sphere of dimension d

    Parameters
    ----------
    dim: int
         dimension

    Returns
    -------
    float number
      the volume [pi^(dim/2.)]/Gamma(dim/2. + 1)
    """
    # Particular cases (for performance):
    if (dim==1):
        return 2.
    if (dim==2):
        return np.pi
    if (dim==3):
        # return 4.*np.pi/3.
        return 4.18879020478639
    if (dim==4):
        # return pi2/2.
        return 4.93480220054468
    if (dim==5):
        # return 8.*pi2/15.
        return 5.26378901391432
    if (dim==6):
        # return pi2*np.pi/6.
        return 5.16771278004997 # From here, V_d starts decreasing with d (the curse of dimensionality)
    
    return np.pi**(dim/2.)/Gamma(dim/2. + 1)

#-----------------------------
# Leandro - Aug 12, 2025 - begin:

[docs]
def density(data, data_base=None, k=1, correct_bias=False, vol_correction='cube', l_cube_over_d=None, workers=-1):
    """
    Estimate the density (pdf) for best use in NN entropy estimate.

    f_i = 1/[ (M-1) * exp(-psi(k)) * V_d * D^d ], where:
    M is size of sample building the tree
    psi is the digamma function
    V_d is the volume of unitary hypersphere in d-dimensions
    D is the Euclidean distance to kth neighbor

    Parameters
    ----------
    data: array [N, dim]
       Data points to be queried
    data_base: array [M, dim]
       Data points of sample that will serve to build the tree, while previous data array is for the query points. If data_base is not provided, it's assumed that data_base = data and M = N
    k: in number
       kth nearest neighbor
    correct_bias: Boolean
       If correct for bias due to boundary effects as proposed by Charzynska & Gambin 2015
    vol_correction: string
       strategy for correction:
       - 'cube': the support is assumed to be a paralelepiped, and the volume around each point is a cube
    l_cube_over_d: float
      Side of cube around each point divided by D, the distance to the k-neighbor.
      A typically good choice is a cube inscribed in the sphere, i.e. side/D = 2/sqrt(dim). If None, this is set by default
    workers: int (default: -1, meaning using all CPUs available).
       Number of CPUs to be used to parallelize the seacrh for NNs.

    Returns
    -------
    float
       density estimate f_i = 1/[ (M-1) * exp(-psi(k)) * V_d * D^d ]
    
    References
    ----------                                                                                                                            
    .. [1] N. Leonenko, L. Pronzato, V. Savani, "A class of Rényi information estimators for multidimensional densities.", Ann. Statist. 36 (5) 2153 - 2182, 2008

    """
    
    if (data_base is not None):
        if (len(np.shape(data)) != len(np.shape(data_base))):
            raise ValueError("tropygal: Data arrays should be of form [N, dim] and [M, dim], or 1D arrays")
    if (len(np.shape(data)) == 1):
        dim = 1
        data = np.reshape(data, (len(data), 1))
        if (data_base is not None):
            data_base = np.reshape(data_base, (len(data_base), 1))
    elif (len(np.shape(data)) == 2):
        if (data_base is not None) and (np.shape(data)[1] != np.shape(data_base)[1]):
            raise ValueError("tropygal: Data arrays should be of form [N, dim] and [M, dim], or 1D arrays")
        dim = np.shape(data)[1]
    else:
        raise ValueError("tropygal: Data array(s) should be of form [N, dim], or 1D array(s)")

    if not isinstance(k, (int, np.integer)):
        raise TypeError(f"tropygal: k must be an integer, got {type(dim).__name__}")
    
    N = np.shape(data)[0]
    if (data_base is not None):
        M = np.shape(data_base)[0]
        tree = KDTree(data_base, leafsize=10) # default leafsize=10
    else:
        M = N
        tree = KDTree(data, leafsize=10) # default leafsize=10
        
    dist, ind = tree.query(data, k=k+1, workers=workers) # workers is number of threads. -1 means all threads; k+1 because the first is the particle itself

    # Improve correction for points with D_NN = 0 (critical for bootstraps, where several points are repeated in the data).
    # In the new version, if D_NN=0, we consider these as same points and just consider them to be copies of the same point, with D_NN to the next neighbor with D_NN >0

    dist_kNN = dist[:,k].copy()
    idx_to_fix = np.where(dist_kNN == 0)[0]
    if len(idx_to_fix) > 0: # Number of points with zero distance (typically, if not zero, very small compared to N, unless using bootstrap samples)
        #print ('tropygal:', len(idx_to_fix),' points with zero D_NN. Taking next neighbors until D_NN>0...')
        # For points with D_NN=0, we look for the next neighbor, but there may be points repeated several times.
        # In bootstraps, the probability por a point to be picked k times is ~ e^{-1}/k!. For k=10 (k=15), this is ~ 10^{-7} (~ 10^{-13}). So, for samples w/ N<10^10, k+15 should be enough:
        max_k = min(k+15, N)  # Query up to k+15 neighbors, but not all points
        dists_fixed, _ = tree.query(data[idx_to_fix], k=max_k+1, workers=workers)

        # Find the first nonzero distance for each affected point
        for i, dist_row in zip(idx_to_fix, dists_fixed):
            nonzero_NN = dist_row[dist_row > 0]  # Ignore self-matches (zero distances)
            if len(nonzero_NN) > 0:
                dist_kNN[i] = nonzero_NN[k-1]  # Take k-th first nonzero distance
            else:
                print ('tropygal: point', i, 'with no neighbor with D_NN>0 found.')

    f = np.exp(psi(k))/( (M-1)*V_d(dim)*dist_kNN**dim )

    if (correct_bias==True):
        # The fraction of the volume around particle i inside the domain:
        log_frac_vol = np.zeros(N)
        
        if (vol_correction=='cube'):
            if l_cube_over_d is None:
                l_cube_over_d = 2/np.sqrt(dim)
            l_cube = l_cube_over_d * dist_kNN
            for j in range(dim):
                if (data_base is not None):
                    xmax = max(data_base[:, j])
                    xmin = min(data_base[:, j])
                else:
                    xmax = max(data[:, j])
                    xmin = min(data[:, j])

                y = np.minimum(np.maximum(data[:, j], xmin), xmax)

                dx_max_over_l_cube = (xmax - y) / l_cube # we only need to correct if this is < 1/2, i.e. if the volume of the cube goes beyond support
                dx_min_over_l_cube = (y - xmin) / l_cube # we only need to correct if this is < 1/2

                needs_correc = dx_max_over_l_cube < 0.5
                log_frac_vol[needs_correc] = log_frac_vol[needs_correc] + np.log(0.5 + dx_max_over_l_cube[needs_correc])

                needs_correc = dx_min_over_l_cube < 0.5
                log_frac_vol[needs_correc] = log_frac_vol[needs_correc] + np.log(0.5 + dx_min_over_l_cube[needs_correc])
            correction = np.exp(log_frac_vol)
        else:
            # raise ValueError("tropygal: vol_correction is the volume around each point assumed for the bias correction. Current possible values: 'cube', 'actions' or 'sph'")
            raise ValueError("tropygal: vol_correction is the volume around each point assumed for the bias correction. Current possible values: 'cube'.")

        return f/correction
    else:
        return f

#-----------------------------

[docs]
def entropy(data, mu=1, k=1, correct_bias=False, vol_correction='cube', l_cube_over_d=None, workers=-1):
    """
    Estimate of (Shannon/Jaynes) differential entropy
    S = - int f ln (f/mu) d^dim x.
    
    The factor mu ensures that f/mu is dimensionless.
    It also stores the density of states if the DF is a function of integrals only.
    For precise estimates, we also want all (x_1, x_2..., x_dim) to be order unit.

    S is estimated as - (1/N) * sum_i ( ln(f_i) - ln(mu_i),
    where f_i is the estimate of the DF f around point/particle/star i
    For NN (Nerest Neighbor) method:
    From e.g. Eq. (10) in Leonenko, Pronzato, Savani (2008):
    f_i = 1/[ (N-1) * exp(-psi(k)) * V_d * D^d ], where:
    N is sample size
    psi is the digamma function
    V_d is the volume of unitary hypersphere in d-dimensions
    D is the Euclidean distance to kth neighbor

    Parameters
    ----------
    data: array [N, dim]
       Data points
    mu: float number or array of size N
       It ensures the argument of ln() is dimensionless.
       If x -> = x/sigma_x, y -> y/sigma_y... -> mu = 1/(sigma_x*sigma_y...).
       It is also the density of states, e.g. mu = g(E), mu = g(E,L) or mu = (2pi)^3,
       in cases where the DF depends only on integrals, e.g. energy, or energy and angular momentum, or actions, respectively.
    k: int value
       kth nearest neighbor
    correct_bias: Boolean
       If correct for bias due to boundary effects as proposed by Charzynska & Gambin 2015
    vol_correction: string
       strategy for correction:
       - 'cube': the support is assumed to be a paralelepiped, and the volume around each point is a cube
    l_cube_over_d: float
      side of cube around each point divided by D, the distance to the k-neighbor.
      A typically good choice is a cube inscribed in the sphere, i.e. side/D = 2/sqrt(dim). If None, this is set by default
    workers: int (default: -1, meaning using all CPUs available)
       Number of CPUs to be used to parallelize the seacrh for NNs.
    
    Returns
    -------
    float
       entropy estimate -<ln(f/mu)> = - (1/N)*sum_i ln(f_i/mu_i)
    
    References
    ----------                                                                                                                            
    .. [1] N. Leonenko, L. Pronzato, V. Savani, "A class of Rényi information estimators for multidimensional densities.", Ann. Statist. 36 (5) 2153 - 2182, 2008
    """

    f = density(data, data_base=None, k=k, correct_bias=correct_bias, vol_correction=vol_correction, l_cube_over_d=l_cube_over_d, workers=workers)
    return -np.mean(np.log(f)) + np.mean(np.log(mu)) # return S = - <ln (f/mu)>

#-----------------------------

[docs]
def cross_entropy(data, data_base, mu=1, k=1, correct_bias=False, vol_correction='cube', l_cube_over_d=None, workers=-1):
    """
    Estimate of the cross entropy H = - int f0 ln (f/mu) d^dim x.
    The factor mu ensures that f/mu is dimensionless and
    it also stores the density of states if the DF is a function of integrals only.
    For precise estimates, we also want all (x_1, x_2..., x_dim) to be order unit.
    H is estimated as (1/N) * sum_i=1^N ln(f_i),
    where f_i is the estimate of the DF f based on the dist. of point i in sample 1 to its kth neighbor in sample 2.
    For NN (Nerest Neighbor) method:
    From e.g. Eq. (11) in Leonenko, Pronzato, Savani (2008):
    f_i = 1/[ M * exp(-psi(k)) * V_d * D^d ], where:
    N is size of sample 1,
    M is size of sample 2,
    psi is the digamma function,
    V_d is the volume of unitary hypersphere in d-dimensions,
    D is the Euclidean distance of particle i in sample 1 to its kth neighbor in sample 2.

    Parameters
    ----------
    data: array [N, dim]
       Data points of sample 1
    data_base: array [M, dim]
       Data points of sample 2
    mu: float number or array of size N
       It ensures the argument of ln() is dimensionless.
       If x -> = x/sigma_x, y -> y/sigma_y... -> mu = 1/(sigma_x*sigma_y...).
       It is also the density of states, e.g. mu = g(E), mu = g(E,L) or mu = (2pi)^3,
       in cases where the DF depends only on integrals, e.g. energy, or energy and angular momentum, or actions, respectively.
    k: int value
       kth nearest neighbor
    correct_bias: Boolean
       If correct for bias due to boundary effects as proposed by Charzynska & Gambin 2015
    vol_correction: string
       strategy for correction:
       - 'cube': the support is assumed to be a paralelepiped, and the volume around each point is a cube
    l_cube_over_d: float
      Side of cube around each point divided by D, the distance to the k-neighbor.
      A typically good choice is a cube inscribed in the sphere, i.e. side/D = 2/sqrt(dim). If None, this is set by default
    workers: int (default: -1, meaning using all CPUs available).
       Number of CPUs to be used to parallelize the seacrh for NNs.

    Returns
    -------
    float
       cross entropy estimate -<ln(f/mu)> = - (1/N)*sum_i=1^N ln(f_i/mu_i)
    """

    f = density(data, data_base=data_base, k=k, correct_bias=correct_bias, vol_correction=vol_correction, l_cube_over_d=l_cube_over_d, workers=workers)
    return -np.mean(np.log(f)) + np.mean(np.log(mu)) # return S = - <ln (f/mu)>

#-----------------------------

[docs]
def C_k(q, k=1):
    """
    For Renyi entropy.
    Check Eq. (7) in Leonenko, Pronzato, Savani (2008)

    Parameters
    ----------
    q: float number
       q index in Rényi entropy (must be != 1)
    k: int value
       kth nearest neighbor

    Returns
    -------
    float
       C_k
    """
    return (Gamma(k)/Gamma(k+1.-q))**(1./(1.-q))

#-----------------------------

[docs]
def renyi_entropy(data, mu=1, q=2, k=1):
    """
    Estimate of Rényi entropy
    S_q = [1/(1 - q)] ln int f (f/mu)^(q-1) d^dim x, for q != 1.
    The factor mu ensures that f/mu is dimensionless.

    S_q is estimated as [1/(1-q)] ln (1/N) * sum_i=1^N (f_i/mu_i)^(q-1),
    where f_i is the estimate of the DF f around point/particle/star i
    For NN (Nerest Neighbor) method:
    From e.g. Eq. (7) in Leonenko, Pronzato, Savani (2008):
    f_i = 1/[ (N-1) * C_k * V_d * D^dim ], where
    C_k = [Gamma(k)/Gamma(k+1-q)]^[1/(1-q)]

    Parameters
    ----------
    data: array [N, dim]
       Data points
    mu: float number or array of size N
       It ensures the argument of ln() is dimensionless.
       If x' = x/sigma_x, y' = y/sigma_y... -> mu = 1/(sigma_x*sigma_y...).
       It is also the density of states, e.g. mu = g(E), mu = g(E,L) or mu = (2pi)^3, in cases where the DF
       depends only on integrals, e.g. energy, or energy and angular momentum, or actions, respectively.
    q: float value
       q-parameter of the entropy; needs to be q != 1
    k: int value
       kth nearest neighbor

    Returns
    -------
    float
       entropy estimate [1/(1 - q)] ln <(f/mu)^(q-1)>
    """

    if (len(np.shape(data)) == 1):
        dim = 1
        data = np.reshape(data, (len(data), 1))
    elif (len(np.shape(data)) == 2):
        dim = np.shape(data)[1]
    else:
        raise ValueError("tropygal: Array with data should be of form [N, dim].")
        return np.nan

    if (q==1):
        raise ValueError ('tropygal: For the Rényi entropy, q needs to be != 1')
        return np.nan
    
    N = np.shape(data)[0]
    
    tree = KDTree(data, leafsize=10) # default leafsize=10
    dist, ind = tree.query(data, k=k+1, workers=-1) # workers is number of threads. -1 means all threads

    # Improve correction for points with D_NN = 0 (critical for bootstraps, where several points are repeated in the data).
    # In the new version, if D_NN=0, we consider these as same points and just consider them to be copies of the same point, with D_NN to the next neighbor with D_NN >0
    dist_kNN = dist[:,k].copy()
    idx_to_fix = np.where(dist_kNN == 0)[0]
    if len(idx_to_fix) > 0: # Number of points with zero distance (typically, if not zero, very small compared to N, unless using bootstrap samples)
        #print ('tropygal:', len(idx_to_fix),' points with zero D_NN. Taking next neighbors until D_NN>0...')
        # For points with D_NN=0, we look for the next neighbor, but there may be points repeated several times.
        # In bootstraps, the probability por a point to be picked k times is ~ e^{-1}/k!. For k=10, this is ~ 10^{-7}. So, for samples w/ N<10^7, k+15 should be enough:
        max_k = min(k+15, N)  # Query up to k+15 neighbors, but not all points
        dists_fixed, _ = tree.query(data[idx_to_fix], k=max_k+1, workers=workers)

        # Find the first nonzero distance for each affected point
        for i, dist_row in zip(idx_to_fix, dists_fixed):
            nonzero_NN = dist_row[dist_row > 0]  # Ignore self-matches (zero distances)
            if len(nonzero_NN) > 0:
                dist_kNN[i] = nonzero_NN[k-1]  # Take k-th first nonzero distance
            else:
                print ('tropygal: point', i, 'with no neighbor with D_NN>0 found.')
        
    #D = dist_kNN[idx]
    f = 1./( (N-1) * C_k(q, k) * V_d(dim) * dist_kNN**dim )

    return (1./(1-q))*np.log(np.mean((f/mu)**(q-1.)))

#-----------------------------

[docs]
def tsallis_entropy(data, mu=1, q=2, k=1):
    """
    Estimate of Tsallis entropy
    S_q = [1/(q - 1)][ 1 - int f (f/mu)^(q-1) d^dim x ], for q != 1.
    The factor mu ensures that f/mu is dimensionless.

    S_q is estimated as [1/(q - 1)] [ 1 - (1/N) * sum_i=1^N (f_i/mu_i)^(q-1) ],
    where f_i is the estimate of the DF f around point/particle/star i
    For NN (Nerest Neighbor) method:
    From e.g. Eq. (7) in Leonenko, Pronzato, Savani (2008):
    f_i = 1/[ (N-1) * C_k * V_d * D^dim ],
    where C_k = [Gamma(k)/Gamma(k+1-q)]^[1/(1-q)]

    Parameters
    ----------
    data: array [N, dim]
       Data points
    mu: float number or array of size N
       It ensures the argument of ln() is dimensionless.
       If x -> = x/sigma_x, y -> y/sigma_y... -> mu = 1/(sigma_x*sigma_y...).
       It is also the density of states, e.g. mu = g(E), mu = g(E,L) or mu = (2pi)^3,
       in cases where the DF depends only on integrals, e.g. energy, or energy and angular momentum, or actions, respectively.
    q: int value
       q-parameter of the entropy; needs to be q != 1
    k: int value
       kth nearest neighbor

    Returns
    -------
    float
       entropy estimate [1/(q - 1)] [ 1 -  <f^(q-1)> ]
    """

    if (len(np.shape(data)) == 1):
        dim = 1
        data = np.reshape(data, (len(data), 1))
    elif (len(np.shape(data)) == 2):
        dim = np.shape(data)[1]
    else:
        raise ValueError("tropygal: Array with data should be of form [N, dim].")
        return np.nan

    if (q==1):
        raise ValueError ('tropygal: For the Tsallis entropy, q needs to be != 1.')
        return np.nan
    
    N = np.shape(data)[0]
    
    tree = KDTree(data, leafsize=10) # default leafsize=10
    dist, ind = tree.query(data, k=k+1, workers=-1) # workers is number of threads. -1 means all threads

    # Improve correction for points with D_NN = 0 (critical for bootstraps, where several points are repeated in the data).
    # In the new version, if D_NN=0, we consider these as same points and just consider them to be copies of the same point, with D_NN to the next neighbor with D_NN >0
    dist_kNN = dist[:,k].copy()
    idx_to_fix = np.where(dist_kNN == 0)[0]
    if len(idx_to_fix) > 0: # Number of points with zero distance (typically, if not zero, very small compared to N, unless using bootstrap samples)
        #print ('tropygal:', len(idx_to_fix),' points with zero D_NN. Taking next neighbors until D_NN>0...')
        # For points with D_NN=0, we look for the next neighbor, but there may be points repeated several times.
        # In bootstraps, the probability por a point to be picked k times is ~ e^{-1}/k!. For k=10, this is ~ 10^{-7}. So, for samples w/ N<10^7, k+15 should be enough:
        max_k = min(k+15, N)  # Query up to k+15 neighbors, but not all points
        dists_fixed, _ = tree.query(data[idx_to_fix], k=max_k+1, workers=workers)

        # Find the first nonzero distance for each affected point
        for i, dist_row in zip(idx_to_fix, dists_fixed):
            nonzero_NN = dist_row[dist_row > 0]  # Ignore self-matches (zero distances)
            if len(nonzero_NN) > 0:
                dist_kNN[i] = nonzero_NN[k-1]  # Take k-th first nonzero distance
            else:
                print ('tropygal: point', i, 'with no neighbor with D_NN>0 found.')
        
    #D = dist_kNN[idx]
    f = 1./( (N-1) * C_k(q, k) * V_d(dim) * dist_kNN**dim )

    return (1./(q - 1))*( 1. - np.mean((f/mu)**(q-1.)))