# Source code for xyz.univariate

import numpy as np
from scipy.spatial.distance import pdist


def entropy_linear(A: np.ndarray) -> float:
    """Linear Gaussian estimate of differential (Shannon) entropy.

    Assumes the data are multivariate Gaussian. For covariance
    :math:`C`, the differential entropy in nats is:

    .. math::

        H = \\frac{1}{2} \\log \\det(C) + \\frac{M}{2} \\log(2\\pi e)

    where :math:`M` is the number of variables.

    Parameters
    ----------
    A : np.ndarray
        Multivariate data, shape ``(n_samples, n_features)`` (N×M).
        A single feature (``M = 1``) is supported.

    Returns
    -------
    float
        Estimated differential entropy in nats.

    Examples
    --------
    >>> import numpy as np
    >>> from xyz.univariate import entropy_linear
    >>> rng = np.random.default_rng(42)
    >>> A = rng.normal(size=(500, 3))
    >>> h = entropy_linear(A)
    >>> np.isfinite(h)
    True
    """
    # np.cov returns a 0-d array when there is only one feature;
    # atleast_2d makes the log-determinant well-defined for M = 1 too.
    C = np.atleast_2d(np.cov(A.T))
    _, M = A.shape
    # Entropy for the multivariate Gaussian case (e.g., Barnett PRL 2009).
    # slogdet avoids the overflow/underflow that log(det(C)) suffers for
    # high-dimensional covariance matrices.
    _sign, logdet = np.linalg.slogdet(C)
    return 0.5 * logdet + 0.5 * M * np.log(2 * np.pi * np.e)
def entropy_kernel(Y: np.ndarray, r: float, metric: str = "chebyshev") -> float:
    """Kernel (step-kernel) estimate of differential entropy.

    Uses the mean log-probability of pairs within radius :math:`r`
    under the chosen distance. By default uses a step kernel with
    Chebyshev (max-norm) distance.

    Parameters
    ----------
    Y : np.ndarray
        Data, shape ``(n_samples, n_features)``.
    r : float
        Radius for the step kernel.
    metric : str, optional
        Distance metric for pairwise distances (e.g. ``"chebyshev"``
        or ``"euclidean"``). Default is ``"chebyshev"``.

    Returns
    -------
    float
        Estimated differential entropy in nats.

    Examples
    --------
    >>> import numpy as np
    >>> from xyz.univariate import entropy_kernel
    >>> rng = np.random.default_rng(42)
    >>> Y = rng.normal(size=(500, 2))
    >>> h = entropy_kernel(Y, 0.1)
    >>> np.isfinite(h)
    True
    """
    # Fraction of sample pairs closer than r under the chosen metric;
    # the entropy estimate is the negative log of that fraction.
    distances = pdist(Y, metric=metric)
    fraction_within = (distances < r).mean()
    return -np.log(fraction_within)
# For details on the Kraskov (KSG) estimator, see:
# https://lizliz.github.io/teaspoon/_modules/teaspoon/parameter_selection/MI_delay.html
# Also read this for the mutual-information case and why the KSG
# estimator may return negative results:
# https://github.com/paulbrodersen/entropy_estimators/issues/11#issuecomment-2109577671
# with the accompanying paper:
# https://arxiv.org/abs/2405.04980
# and a Python implementation:
# https://github.com/moldyn/NorMI
[docs] def entropy_binning(Y, c, quantize, log_base: str = "nat"): """Binning (histogram) estimate of Shannon entropy. Discretizes each column into ``c`` bins and computes entropy from the empirical distribution. If ``quantize`` is False, data are binned by equal-width bins; otherwise ``Y`` is assumed already quantized. Parameters ---------- Y : array-like Data matrix, shape ``(n_samples, n_features)``. c : int Number of bins per dimension. quantize : bool If True, treat ``Y`` as already quantized (values in ``0..c-1``). If False, bin continuous values with :func:`xyz.utils.quantize`. log_base : str, optional Logarithm base; currently only ``"nat"`` is supported. Returns ------- float Estimated entropy (implementation may return from internal state). Examples -------- Used internally by discrete estimators; for continuous entropy prefer :class:`xyz.KSGEntropy` or :class:`xyz.MVNEntropy`. """ # TODO write version with entropy in bits from .utils import quantize if quantize: Yq = Y else: Yq = Y for j in range(Y.shape[1]): Yq[:, j] = quantize(Y[:, j], c) - 1 Q: np.ndarray = Yq