Source code for xyz.univariate

import numpy as np
from scipy.spatial.distance import pdist



[docs]
def entropy_linear(A: np.ndarray) -> float:
    """Linear Gaussian estimate of differential (Shannon) entropy.

    Assumes the data are multivariate Gaussian. For sample covariance
    :math:`C`, the differential entropy in nats is:

    .. math::

        H = \\frac{1}{2} \\log \\det(C) + \\frac{M}{2} \\log(2\\pi e)

    where :math:`M` is the number of variables.

    The log-determinant is obtained with :func:`numpy.linalg.slogdet`, so
    the estimate stays finite when the determinant itself would underflow
    (many variables and/or small variances).

    Parameters
    ----------
    A : np.ndarray
        Data, shape ``(n_samples, n_features)`` (N×M). A 1-D array of
        length ``n_samples`` is treated as a single variable.

    Returns
    -------
    float
        Estimated differential entropy in nats. ``-inf`` if the sample
        covariance is exactly singular, ``nan`` if its determinant is
        negative (only possible through round-off).

    Examples
    --------
    >>> import numpy as np
    >>> from xyz.univariate import entropy_linear
    >>> rng = np.random.default_rng(42)
    >>> A = rng.normal(size=(500, 3))
    >>> h = entropy_linear(A)
    >>> bool(np.isfinite(h))
    True
    """
    A = np.asarray(A)
    if A.ndim == 1:
        # A flat vector is one variable observed n_samples times.
        A = A[:, np.newaxis]
    M = A.shape[1]

    # np.cov collapses to a 0-d array when there is a single variable;
    # the determinant routines require at least a 2-D matrix.
    C = np.atleast_2d(np.cov(A, rowvar=False))

    # Entropy for the multivariate Gaussian case (e.g., Barnett PRL 2009).
    # slogdet avoids the under/overflow of computing det(C) and then its log.
    sign, log_det = np.linalg.slogdet(C)
    if sign < 0:
        # log of a negative determinant is undefined; match np.log's result.
        return float("nan")
    return float(0.5 * log_det + 0.5 * M * np.log(2 * np.pi * np.e))




[docs]
def entropy_kernel(Y: np.ndarray, r: float, metric: str = "chebyshev") -> float:
    """Step-kernel estimate of entropy from pairwise distances.

    Computes the distance between every pair of distinct samples, takes the
    fraction of pairs whose distance is strictly smaller than :math:`r`, and
    returns the negative natural logarithm of that fraction.

    Parameters
    ----------
    Y : np.ndarray
        Data, shape ``(n_samples, n_features)``.
    r : float
        Radius of the step kernel; a pair counts when its distance is
        strictly less than ``r``.
    metric : str, optional
        Metric name forwarded to :func:`scipy.spatial.distance.pdist`
        (e.g. ``"chebyshev"`` or ``"euclidean"``). Default is
        ``"chebyshev"`` (max-norm).

    Returns
    -------
    float
        Negative log (in nats) of the fraction of sample pairs closer
        than ``r``.

    Examples
    --------
    >>> import numpy as np
    >>> from xyz.univariate import entropy_kernel
    >>> rng = np.random.default_rng(42)
    >>> Y = rng.normal(size=(500, 2))
    >>> h = entropy_kernel(Y, 0.1)
    >>> bool(np.isfinite(h))
    True
    """
    # Condensed vector of distances, one entry per unordered sample pair.
    pair_distances = pdist(Y, metric=metric)
    # Step kernel: 1 for pairs inside the radius, 0 otherwise.
    inside_radius = pair_distances < r
    neighbour_fraction = inside_radius.mean()
    return -np.log(neighbour_fraction)



# For details on the Kraskov (KSG) estimator, see:
# https://lizliz.github.io/teaspoon/_modules/teaspoon/parameter_selection/MI_delay.html
# For the mutual-information case, and why the KSG estimator may return
# negative results, see:
# https://github.com/paulbrodersen/entropy_estimators/issues/11#issuecomment-2109577671
# The corresponding paper is here:
# https://arxiv.org/abs/2405.04980
# A Python implementation is available here:
# https://github.com/moldyn/NorMI



[docs]
def entropy_binning(Y, c, quantize, log_base: str = "nat"):
    """Binning (histogram) estimate of Shannon entropy.

    Discretizes each column into ``c`` bins and computes entropy from the
    empirical distribution. If ``quantize`` is False, data are binned by
    equal-width bins; otherwise ``Y`` is assumed already quantized.

    Parameters
    ----------
    Y : array-like
        Data matrix, shape ``(n_samples, n_features)``.
    c : int
        Number of bins per dimension.
    quantize : bool
        If True, treat ``Y`` as already quantized (values in ``0..c-1``).
        If False, bin continuous values with :func:`xyz.utils.quantize`.
    log_base : str, optional
        Logarithm base; currently only ``"nat"`` is supported.

    Returns
    -------
    float
        Estimated entropy (implementation may return from internal state).

    Examples
    --------
    Used internally by discrete estimators; for continuous entropy prefer
    :class:`xyz.KSGEntropy` or :class:`xyz.MVNEntropy`.
    """
    # TODO write version with entropy in bits

    from .utils import quantize

    if quantize:
        Yq = Y
    else:
        Yq = Y
        for j in range(Y.shape[1]):
            Yq[:, j] = quantize(Y[:, j], c) - 1
    Q: np.ndarray = Yq