# -*- coding: utf-8 -*-
"""STATS MODULE
This module contains some common statistical measures useful in weak-lensing
studies. For example, the higher-order moments of filtered convergence maps
can be used to constrain cosmological parameters.
"""
import numpy as np
from scipy.optimize import brentq
from scipy.stats import norm, gaussian_kde
def mad(x):
    """Compute the median absolute deviation (MAD) of an array.

    Parameters
    ----------
    x : array_like
        Input array.

    Returns
    -------
    float
        The MAD of `x`, i.e. the median of the absolute deviations
        from the median of `x`.
    """
    center = np.median(x)
    return np.median(np.abs(x - center))
def skew(x):
    """Compute the skewness of an array as the third standardized moment.

    Parameters
    ----------
    x : array_like
        Input array. If `x` is multidimensional, skewness is automatically
        computed over all elements in the array, not just along one axis.

    Returns
    -------
    float
        Skewness of `x`.

    Notes
    -----
    For array x, the calculation is carried out as E[((x - mu) / sigma)^3],
    where E() is the expectation value, mu is the mean of x, and sigma is the
    uncorrected sample standard deviation.

    This is equivalent to

    >>> n = len(x)
    >>> numer = np.power(x - mu, 3).sum() / n
    >>> denom = np.power(np.power(x - mu, 2).sum() / (n - 1), 3 / 2)
    >>> skew = numer / denom

    and to the normalized 3rd-order central moment

    >>> mu_n(x, 3, normed=True)

    This function's output matches that of scipy.stats.skew exactly, but
    the latter is typically faster.
    """
    # Coerce to ndarray so that genuine array_like inputs (lists, tuples)
    # work as documented, not only np.ndarray.
    x = np.asanyarray(x)
    mean = x.mean(axis=None)
    sigma = x.std(ddof=0, axis=None)
    # Guard against zero variance (constant input) to avoid division by zero;
    # the standardized deviations are then all zero anyway.
    if sigma <= 0:
        sigma = 1
    return np.mean(np.power((x - mean) / sigma, 3), axis=None)
def kurt(x, fisher=True):
    """Compute the kurtosis of an array as the fourth standardized moment.

    Parameters
    ----------
    x : array_like
        Input array. If `x` is multidimensional, kurtosis is automatically
        computed over all elements in the array, not just along one axis.
    fisher : bool, optional
        If True, use Fisher's normalization, i.e. subtract 3 from the result.

    Returns
    -------
    float
        Kurtosis of `x`.

    Notes
    -----
    For array x, the calculation is carried out as E[((x - mu) / sigma)^4],
    where E() is the expectation value, mu is the mean of x, and sigma is the
    uncorrected sample standard deviation.

    This is equivalent to the normalized 4th-order central moment

    >>> mu_n(x, 4, normed=True) - 3

    This function's output matches that of scipy.stats.kurtosis very well,
    but the latter is typically faster.
    """
    # Coerce to ndarray so that genuine array_like inputs (lists, tuples)
    # work as documented, not only np.ndarray.
    x = np.asanyarray(x)
    mean = x.mean(axis=None)
    sigma = x.std(ddof=0, axis=None)
    # Guard against zero variance (constant input) to avoid division by zero.
    if sigma <= 0:
        sigma = 1
    moment4 = np.mean(np.power((x - mean) / sigma, 4), axis=None)
    # Fisher's definition shifts the Gaussian reference value to 0.
    return moment4 - (3 if fisher else 0)
def mu_n(x, order, normed=False):
    """Compute the (normalized) nth-order central moment of a distribution.

    Parameters
    ----------
    x : array_like
        Input array. If `x` is multidimensional, the moment is automatically
        computed over all elements in the array, not just along one axis.
    order : int (positive)
        Order of the moment.
    normed : bool, optional
        If True, normalize the result by sigma^order, where sigma is the
        corrected sample standard deviation of `x`. Default is False.
        Normalization is only applied for orders greater than 2.

    Returns
    -------
    float
        Nth-order moment of `x`.

    Notes
    -----
    This function's output matches that of scipy.stats.moment very well, but
    the latter is typically faster.
    """
    # Validate the requested order
    order = int(order)
    assert order > 0, "Only n > 0 supported."

    data = np.atleast_1d(x).flatten()
    deviations = data - data.mean()
    if normed and order > 2:
        # Standardize by the corrected (ddof=1) sample standard deviation
        sigma = data.std(ddof=1)
        return np.power(deviations / sigma, order).mean()
    return np.power(deviations, order).mean()
def kappa_n(x, order):
    """Compute the nth-order cumulant of a distribution.

    Parameters
    ----------
    x : array_like
        Input array. If `x` is multidimensional, the cumulant is automatically
        computed over all elements in the array, not just along one axis.
    order : int between 2 and 6, inclusive
        Order of the cumulant.

    Returns
    -------
    float
        Nth-order cumulant of `x`.

    Notes
    -----
    Cumulants are expressed in terms of central moments via the standard
    moment-to-cumulant relations.

    This function's output matches that of scipy.stats.kstat very well, but
    the latter is typically faster.
    """
    # Validate the requested order
    order = int(order)
    assert order in range(2, 7), "Order {} not supported.".format(order)

    def m(k):
        # Shorthand for the kth central moment of x
        return mu_n(x, k, normed=False)

    if order == 2:
        return m(2)
    elif order == 3:
        return m(3)
    elif order == 4:
        return m(4) - 3 * m(2) ** 2
    elif order == 5:
        return m(5) - 10 * m(3) * m(2)
    # order == 6
    return m(6) - 15 * m(4) * m(2) - 10 * m(3) ** 2 + 30 * m(2) ** 3
def fdr(x, tail, alpha=0.05, kde=False, n_samples=10000, debug=False):
    """Compute the false discovery rate (FDR) threshold of a distribution.

    Parameters
    ----------
    x : array_like
        Samples of the distribution. If `x` is multidimentional, it will
        first be flattened.
    tail : {'left', 'right'}
        Side of the distribution for which to compute the threshold.
    alpha : float, optional
        Maximum average false discovery rate. Default is 0.05.
    kde : bool, optional
        If True, compute p-values from the distribution smoothed by kernel
        density estimation. Not recommended for large `x`. Default is False.
    n_samples : int, optional
        Number of samples to draw if using KDE. Default is 10000.
    debug : bool, optional
        If True, print the number of detections and the final p-value.
        Default is False.

    Returns
    -------
    float
        FDR threshold.
    """
    # Validate tail choice
    assert tail in ('left', 'right'), "Invalid tail."

    data = np.atleast_1d(x).flatten()
    mu = data.mean()
    std = data.std(ddof=1)

    if kde:
        # Approximate the distribution by kernel density estimation
        density = gaussian_kde(data, bw_method='silverman')
        lo, hi = data.min(), data.max()
        # Bracketing interval for the root finder, one sigma beyond the range
        vmax = np.sign(hi) * np.abs(hi + std)
        vmin = np.sign(lo) * np.abs(lo - std)

        def cdf(z, tail):
            # Tail probability of the smoothed distribution at z
            if tail == 'right':
                return density.integrate_box(-np.inf, z)
            return density.integrate_box(z, np.inf)

        def cdfinv(t, tail):
            # Invert the CDF numerically via root finding on [vmin, vmax]
            assert t > 0 and t < 1, "Input to cdfinv must be in (0, 1)."
            try:
                return brentq(lambda u: cdf(u, tail) - t, vmin, vmax)
            except ValueError:
                print("Warning: bad value. Returning 0.")
                print("threshold = {}".format(t))
                print("vmin = {}".format(vmin))
                print("vmax = {}".format(vmax))
                return 0

        # Sample the relevant tail region and compute p-values there
        if tail == 'right':
            smin, smax = mu + std, vmax
        else:
            smin, smax = vmin, mu - std
        draws = (smax - smin) * np.random.random(n_samples) + smin
        pvals = 1 - np.array([cdf(s, tail) for s in draws])
    else:
        # Compute p-values assuming Gaussian null hypothesis
        z = (data - mu) / std
        pvals = 1 - norm.cdf(z) if tail == 'right' else norm.cdf(z)

    # Benjamini-Hochberg style selection: sort p-values and find the last
    # one below its rank-dependent threshold alpha * i / N
    order = np.argsort(pvals)
    thresholds = alpha * np.arange(1, len(pvals) + 1) / float(len(pvals))
    below = (pvals[order] - thresholds) < 0
    lastindex = np.argmax(below.cumsum())
    pval = pvals[order][lastindex]

    if debug:
        print("{} detection(s) / {}".format(lastindex + 1, len(pvals)))
        print("last pval = {}".format(pval))

    # Invert pval to get the threshold in data units
    if kde:
        return cdfinv(1 - pval, tail)
    return data[order][lastindex]
def hc(x, kind=1):
    """Compute a higher criticism statistic of a distribution.

    Parameters
    ----------
    x : array_like
        Samples of the distribution. If `x` is multidimentional, it will
        first be flattened.
    kind : int, optional
        Either 1 (HC^*) or 2 (HC^+). Default is 1.

    Returns
    -------
    float
        Higher criticism of the first (*) or second (+) kind.

    References
    ----------
    * Donoho & Jin, The Annals of Statistics 32, 962 (2004)
    * Pires, Starck, Amara, et al., A&A 505, 969 (2009)
    """
    # Validate kind
    assert kind in (1, 2), "Invalid kind. Must be either 1 or 2."

    # Standardize the samples against their own mean and (corrected) std
    data = np.atleast_1d(x).flatten()
    z = (data - data.mean()) / data.std(ddof=1)

    # Sorted right-tail p-values under a Gaussian null
    pvals = np.sort(1 - norm.cdf(z))

    # Binomial variance term p(1 - p); drop entries where it vanishes to
    # avoid division by zero and complex numbers
    variance = pvals - pvals ** 2
    keep = variance > 0
    pvals = pvals[keep]
    n = len(pvals)

    # HC score for each rank i: sqrt(n) * (i/n - p_(i)) / sqrt(p(1 - p))
    ranks = np.arange(1, n + 1) / float(n)
    scores = np.sqrt(n) * (ranks - pvals) / np.sqrt(variance[keep])

    if kind == 2:
        # HC^+ restricts to p-values in [1/n, 1 - 1/n]
        inv = 1. / n
        scores = scores[(pvals >= inv) & (pvals <= (1. - inv))]

    return np.abs(scores).max()