Source code for lenspack.stats

# -*- coding: utf-8 -*-

"""STATS MODULE

This module contains some common statistical measures useful in weak-lensing
studies. For example, the higher-order moments of filtered convergence maps
can be used to constrain cosmological parameters.

"""

import numpy as np
from scipy.optimize import brentq
from scipy.stats import norm, gaussian_kde


[docs]def mad(x): """Compute median absolute deviation (MAD) of an array. Parameters ---------- x : array_like Input array. Returns ------- float The MAD of `x`. """ return np.median(np.abs(x - np.median(x)))
[docs]def skew(x): """Compute the skewness of an array as the third standardized moment. Parameters ---------- x : array_like Input array. If `x` is multidimensional, skewness is automatically computed over all elements in the array, not just along one axis. Returns ------- float Skewness of `x`. Notes ----- For array x, the calculation is carried out as E[((x - mu) / sigma)^3], where E() is the expectation value, mu is the mean of x, and sigma is the uncorrected sample standard deviation. This is equivalent to >>> n = len(x) >>> numer = np.power(x - mu, 3).sum() / n >>> denom = np.power(np.power(x - mu, 2).sum() / (n - 1), 3 / 2) >>> skew = numer / denom and to the normalized 3rd-order central moment >>> mu_n(x, 3, normed=True) This function's output matches that of scipy.stats.skew exactly, but the latter is typically faster. """ mean = x.mean(axis=None) sigma = x.std(ddof=0, axis=None) sigma = 1 if sigma <= 0 else sigma return np.mean(np.power((x - mean) / sigma, 3), axis=None)
[docs]def kurt(x, fisher=True): """Compute the kurtosis of an array as the fourth standardized moment. Parameters ---------- x : array_like Input array. If `x` is multidimensional, kurtosis is automatically computed over all elements in the array, not just along one axis. fisher : bool, optional If True, use Fisher's normalization, i.e. subtract 3 from the result. Returns ------- float Kurtosis of `x`. Notes ----- For array x, the calculation is carried out as E[((x - mu) / sigma)^4], where E() is the expectation value, mu is the mean of x, and sigma is the uncorrected sample standard deviation. This is equivalent to the normalized 4th-order central moment >>> mu_n(x, 4, normed=True) - 3 This function's output matches that of scipy.stats.kurtosis very well, but the latter is typically faster. """ mean = x.mean(axis=None) sigma = x.std(ddof=0, axis=None) sigma = 1 if sigma <= 0 else sigma return np.mean(np.power((x - mean) / sigma, 4), axis=None) - [0, 3][fisher]
[docs]def mu_n(x, order, normed=False): """Compute the (normalized) nth-order central moment of a distribution. Parameters ---------- x : array_like Input array. If `x` is multidimensional, the moment is automatically computed over all elements in the array, not just along one axis. order : int (positive) Order of the moment. normed : bool, optional If True, normalize the result by sigma^order, where sigma is the corrected sample standard deviation of `x`. Default is False. Returns ------- float Nth-order moment of `x`. Notes ----- This function's output matches that of scipy.stats.moment very well, but the latter is typically faster. """ # Check order order = int(order) assert order > 0, "Only n > 0 supported." x = np.atleast_1d(x).flatten() mean = x.mean() if normed and order > 2: sigma = x.std(ddof=1) result = np.power((x - mean) / sigma, order).mean() else: result = np.power(x - mean, order).mean() return result
[docs]def kappa_n(x, order): """Compute the nth-order cumulant of a distribution. Parameters ---------- x : array_like Input array. If `x` is multidimensional, the cumulant is automatically computed over all elements in the array, not just along one axis. order : int between 2 and 6, inclusive Order of the cumulant. Returns ------- float Nth-order cumulant of `x`. Notes ----- This function's output matches that of scipy.stats.kstat very well, but the latter is typically faster. """ # Check order order = int(order) assert order in range(2, 7), "Order {} not supported.".format(order) if order == 2: kappa = mu_n(x, 2, normed=False) if order == 3: kappa = mu_n(x, 3, normed=False) if order == 4: mu2 = mu_n(x, 2, normed=False) mu4 = mu_n(x, 4, normed=False) kappa = mu4 - 3 * mu2**2 if order == 5: mu2 = mu_n(x, 2, normed=False) mu3 = mu_n(x, 3, normed=False) mu5 = mu_n(x, 5, normed=False) kappa = mu5 - 10 * mu3 * mu2 if order == 6: mu2 = mu_n(x, 2, normed=False) mu3 = mu_n(x, 3, normed=False) mu4 = mu_n(x, 4, normed=False) mu6 = mu_n(x, 6, normed=False) kappa = mu6 - 15 * mu4 * mu2 - 10 * mu3**2 + 30 * mu2**3 return kappa
[docs]def fdr(x, tail, alpha=0.05, kde=False, n_samples=10000, debug=False): """Compute the false discovery rate (FDR) threshold of a distribution. Parameters ---------- x : array_like Samples of the distribution. If `x` is multidimentional, it will first be flattened. tail : {'left', 'right'} Side of the distribution for which to compute the threshold. alpha : float, optional Maximum average false discovery rate. Default is 0.05. kde : bool, optional If True, compute p-values from the distribution smoothed by kernel density estimation. Not recommended for large `x`. Default is False. n_samples : int, optional Number of samples to draw if using KDE. Default is 10000. debug : bool, optional If True, print the number of detections and the final p-value. Default is False. Returns ------- float FDR threshold. Examples -------- ... """ # Check inputs assert tail in ('left', 'right'), "Invalid tail." # Basic measures x = np.atleast_1d(x).flatten() N = len(x) mean = x.mean() sigma = x.std(ddof=1) if kde: # Approximate the distribution by kernel density estimation pdf = gaussian_kde(x, bw_method='silverman') amin, amax = x.min(), x.max() vmax = np.sign(amax) * np.abs(amax + sigma) vmin = np.sign(amin) * np.abs(amin - sigma) # Cumulative distribution function def cdf(z, tail): if tail == 'right': return pdf.integrate_box(-np.inf, z) else: return pdf.integrate_box(z, np.inf) # Inverse cumulative distribution function def cdfinv(t, tail): assert t > 0 and t < 1, "Input to cdfinv must be in (0, 1)." try: result = brentq(lambda x: cdf(x, tail) - t, vmin, vmax) except ValueError: print("Warning: bad value. Returning 0.") print("threshold = {}".format(t)) print("vmin = {}".format(vmin)) print("vmax = {}".format(vmax)) result = 0 return result # Compute p-values if tail == 'right': smin = mean + sigma smax = vmax else: smax = mean - sigma smin = vmin samples = (smax - smin) * np.random.random(n_samples) + smin pvals = 1 - np.array([cdf(s, tail) for s in samples]) else: # Compute p-values assuming Gaussian null hypothesis z = (x - mean) / sigma if tail == 'right': pvals = 1 - norm.cdf(z) else: pvals = norm.cdf(z) # Sort p-values and find the last one inds = np.argsort(pvals) ialpha = alpha * np.arange(1, len(pvals) + 1) / float(len(pvals)) diff = pvals[inds] - ialpha lastindex = np.argmax((diff < 0).cumsum()) pval = pvals[inds][lastindex] if debug: print("{} detection(s) / {}".format(lastindex + 1, len(pvals))) print("last pval = {}".format(pval)) # Invert pval to get threshold if kde: tval = cdfinv(1 - pval, tail) else: tval = x[inds][lastindex] return tval
[docs]def hc(x, kind=1): """Compute a higher criticism statistic of a distribution. Parameters ---------- x : array_like Samples of the distribution. If `x` is multidimentional, it will first be flattened. kind : int, optional Either 1 (HC^*) or 2 (HC^+). Default is 1. Returns ------- float Higher criticism of the first (*) or second (+) kind. References ---------- * Donoho & Jin, The Annals of Statistics 32, 962 (2004) * Pires, Starck, Amara, et al., A&A 505, 969 (2009) Examples -------- ... """ # Check inputs assert kind in (1, 2), "Invalid kind. Must be either 1 or 2." # Basic measures x = np.atleast_1d(x).flatten() mean = x.mean() sigma = x.std(ddof=1) # Normalize values z = (x - mean) / sigma # Compute and sort p-values pvals = 1 - norm.cdf(z) pvals = np.sort(pvals) # Compute HC value denom2 = pvals - np.power(pvals, 2) good_inds = denom2 > 0 # Avoid division by zero and complex numbers pvals = pvals[good_inds] n = len(pvals) numer = np.sqrt(n) * (np.arange(1, n + 1) / float(n) - pvals) denom = np.sqrt(denom2[good_inds]) ratio = numer / denom if kind == 2: ninv = 1. / n sel = (pvals >= ninv) & (pvals <= (1. - ninv)) ratio = ratio[sel] return np.abs(ratio).max()