# diststats.py - Distance distribution descriptors
# -----------------------------------------------
# This file is a part of DeerLab. License is MIT (see LICENSE.md).
# Copyright(c) 2019-2023: Luis Fabregas, Stefan Stoll and other contributors.
import numpy as np
import warnings
import copy
from scipy.signal import find_peaks
from scipy.integrate import cumulative_trapezoid as cumtrapz
[docs]
def diststats(r, P, Puq=None, verbose=False, threshold=None):
r"""
Computes statistical quantities for the location, spread, and shape of
a distance distribution, with or without their corresponding uncertainties.
This function calculates various statistical quantities of a distance distribution,
such as its mean, median, mode, etc. It can also calculate the uncertainty
of these quantities using moment-based or bootstrapped uncertainty quantification, if provided.
The function allows to specify whether to print a summary of the results and the
peak detection threshold for the calculation of the mode of the distribution.
Parameters
----------
r : array_like
Distance axis, in nm. The distances in the distribution.
P : array_like
Distance distribution, does not have to be normalized. The probability or
relative frequency of the distances in `r`.
Puq : :ref:`UQResult`, optional
Uncertainty quantification of the distance distribution. If Puq is not given, a
single output is returned without any uncertainty estimation. If given, two outputs
are returned containing the uncertainty estimation.
verbose : boolean, optional
Print a summary of all statistical quantities (and their uncertainties if calculated).
threshold : float, optional
Peak detection threshold for the calculation of modes of a distribution. The default is ``max(P)/10``.
Returns
-------
estimators : dict
Dictionary of shape, location, and spread descriptors of the input distance distribution:
General parameters
* ``'rmin'`` - Minimum distance in the distribution range, in nm
* ``'rmax'`` - Maximum distance in the distribution range, in nm
* ``'int'`` - Integral of the distance distribution
Location parameters
* ``'mean'`` or ``'moment1'`` - Mean distance, in nm (see `more <https://en.wikipedia.org/wiki/Mean>`__)
* ``'median'`` - Median distance, in nm (see `more <https://en.wikipedia.org/wiki/Median>`__)
* ``'iqm'`` - Interquartile mean (IQM) distance, in nm (see `more <https://en.wikipedia.org/wiki/Interquartile_mean>`__)
* ``'mode'`` - First modal distance, in nm (see `more <https://en.wikipedia.org/wiki/Mode_(statistics)>`__)
* ``'modes'`` - All modal distances, in nm (see `more <https://en.wikipedia.org/wiki/Mode_(statistics)>`__)
Spread parameters
* ``'iqr'`` - Interquartile range, in nm (see `more <https://en.wikipedia.org/wiki/Interquartile_range>`__)
* ``'mad'`` - Mean absolute deviation (MAD), in nm (see `more <https://en.wikipedia.org/wiki/Average_absolute_deviation>`__)
* ``'std'`` - Standard deviation, in nm (see `more <https://en.wikipedia.org/wiki/Standard_deviation>`__)
* ``'var'`` or ``'moment2'`` - Variance, in nm² (see `more <https://en.wikipedia.org/wiki/Variance>`__)
* ``'entropy'`` - Shannon entropy, in nat (see `more <https://en.wikipedia.org/wiki/Entropy_(information_theory)>`__)
Shape parameters
* ``'modality'`` - Modality (number of modes)
* ``'skewness'`` or ``'moment3'`` - Skewness (see `more <https://en.wikipedia.org/wiki/Skewness>`__)
* ``'kurtosis'`` - Excess kurtosis (see `more <https://en.wikipedia.org/wiki/Kurtosis>`__)
* ``'moment4'`` - 4th moment (kurtosis) (see `more <https://en.wikipedia.org/wiki/Kurtosis>`__)
uq : dict of :ref:`UQResult`
Dictionary of the parameters moment-based uncertainty quantification.
See above for the dictionary keys. Only calculated if ``Puq`` is provided.
Notes
-----
For the ``'mode'``, ``'modes'`` and ``'modality'`` parameters, moment-based uncertainties are not
available. Uncertainties can, however, be calculated via bootsrapping of these quantities, e.g. ::
def analyze_rmode(V):
fit = dl.fitmodel(V,t,r)
rmode = dl.diststats(fit.P,r)[0]['mode']
return rmode
# Bootstrap analysis of distance mode
rmode_uq = dl.bootstrap_analysis(V,Vfit,analyze_rmode)
"""
P,r = np.atleast_1d(P,r)
# Check to avoid non-sensical results with syntax diststats(P,r)
if not np.all(np.diff(r)>0):
raise KeyError('The distance axis (1st argument) must be a monotnously increasing vector.')
if threshold is None:
threshold = np.max(P)/10
# Auxiliary functions
# -------------------
int = np.trapz(P,r) if not np.all(P==0) else 1
def normalize(P):
return P/int
# Percentile function
def pctile(r,P,p):
cdf = cumtrapz(normalize(P),r,initial=0)
cdf, index = np.unique(cdf,return_index=True)
rpctile = np.interp(p/100,cdf,r[index])
return rpctile
# Expectation operator function
def E(x,P,r):
return np.trapz(x*normalize(P),r)
# Location estimators
# -------------------
# 1st moment - Mean
meanfcn = lambda P: E(r,P,r)
# Median
medianfcn = lambda P: pctile(r,P,50)
# Interquartile mean
def iqmfcn(P):
IQrange = (r>pctile(r,P,25)) & (r<pctile(r,P,75))
return E(r[IQrange],P[IQrange]/np.trapz(normalize(P)[IQrange],r[IQrange]),r[IQrange])
# Mode
modefcn = lambda P: r[np.argmax(P)]
# Modes
modesfcn = lambda P: r[find_peaks(P,height=threshold)[0]]
# Spread estimators
# -----------------
# Interquartile range
iqrfcn = lambda P: pctile(r,P,75) - pctile(r,P,25)
# Mean absolute deviation
madfcn = lambda P: E(abs(r - meanfcn(P)),P,r)
# 2nd moment - Variance
variancefcn = lambda P: E((r - meanfcn(P))**2,P,r)
# 2nd moment - Standard deviation
stdfcn = lambda P: np.sqrt(variancefcn(P))
# Entropy (information theory)
entropyfcn = lambda P: -E(np.log(np.maximum(np.finfo(float).eps,normalize(P))),P,r)
# Shape estimators
# ----------------
# Modality
modalityfcn = lambda P: np.size(modesfcn(P))
# 3rd moment - Skewness
skewnessfcn = lambda P: E(((r - meanfcn(P))/stdfcn(P))**3,P,r)
# 4th moment - Kurtosis
kurtosisfcn = lambda P: E(((r - meanfcn(P))/stdfcn(P))**4,P,r)
# Excess kurtosis
exkurtosisfcn = lambda P: 3 - E(((r - meanfcn(P))/stdfcn(P))**4,P,r)
# 0th moment - Integral
intfcn = lambda P: np.trapz(P,r)
# Calculate distribution estimators
estimators = {
'rmin': min(r),
'rmax': max(r),
'int': intfcn(P),
'mean': meanfcn(P),
'median': medianfcn(P),
'mode': modefcn(P),
'modes': modesfcn(P),
'iqm': iqmfcn(P),
'mad': madfcn(P),
'var': variancefcn(P),
'std': stdfcn(P),
'iqr': iqrfcn(P),
'entropy': entropyfcn(P),
'modality': modalityfcn(P),
'skewness': skewnessfcn(P),
'kurtosis': exkurtosisfcn(P),
'moment1': meanfcn(P),
'moment2': variancefcn(P),
'moment3': skewnessfcn(P),
'moment4': kurtosisfcn(P),
}
# Calculate distribution estimator uncertainties if uncertainty of P is given
uq = None
if Puq is not None:
uq = {
'rmin': None,
'rmax': None,
'int': None,
'mean': _propagation(Puq,meanfcn),
'median': _propagation(Puq,medianfcn),
'mode': None,
'modes': None,
'iqm': _propagation(Puq,iqmfcn),
'mad': _propagation(Puq,madfcn),
'var': _propagation(Puq,variancefcn),
'std': _propagation(Puq,stdfcn),
'iqr': _propagation(Puq,iqrfcn),
'entropy': _propagation(Puq,entropyfcn),
'modality': None,
'skewness': _propagation(Puq,skewnessfcn),
'kurtosis': _propagation(Puq,exkurtosisfcn),
'moment1': _propagation(Puq,meanfcn),
'moment2': _propagation(Puq,variancefcn),
'moment3': _propagation(Puq,skewnessfcn),
'moment4': _propagation(Puq,kurtosisfcn),
}
# Print if requested
if verbose:
_print_estimators(r,estimators,uq)
return estimators,uq
def _propagation(Puq,fcn):
# Propagate the uncertainty to the function
with warnings.catch_warnings():
# Ignore warnings when evaluating Jacobians outside of reasonable bounds
warnings.simplefilter('ignore')
uq_ = Puq.propagate(fcn)
uq = copy.deepcopy(uq_)
def ci(p):
paramci = np.atleast_2d(uq_.ci(p))
return [paramci[:,0][0],paramci[:,1][0]]
# Wrap the ci() method to simplify array
uq.ci = ci
return uq
def _print_estimators(r,estimators,uq):
# Print summary of estimators if requested
if uq is None:
print('-------------------------------------------------')
print('Distribution Statistics')
print('-------------------------------------------------')
print(f'Range {min(r):.2f}-{max(r):.2f} nm')
print(f'Integral {estimators["int"]:.2f}')
print('-------------------------------------------------')
print('Location')
print('-------------------------------------------------')
print(f'Mean {estimators["mean"]:.2f} nm')
print(f'Median {estimators["median"]:.2f} nm')
print(f'Interquartile mean {estimators["iqm"]:.2f} nm')
print(f'Mode {estimators["mode"]:.2f} nm')
print('-------------------------------------------------')
print('Spread')
print('-------------------------------------------------')
print(f'Standard deviation {estimators["std"]:.2f} nm')
print(f'Mean absolute deviation {estimators["mad"]:.2f} nm')
print(f'Interquartile range {estimators["iqr"]:.2f} nm')
print(f'Variance {estimators["var"]:.2f} nm²')
print('-------------------------------------------------')
print('Shape')
print('-------------------------------------------------')
print(f'Modality {estimators["modality"]} ')
print(f'Skewness {estimators["skewness"]:.2f} ')
print(f'Excess kurtosis {estimators["kurtosis"]:.2f} ')
print('-------------------------------------------------')
else:
print('-------------------------------------------------')
print('Distribution Statistics')
print('-------------------------------------------------')
print(f'Range {min(r):.2f}-{max(r):.2f} nm')
print(f'Integral {estimators["int"]:.2f}')
print('-------------------------------------------------')
print('Location')
print('-------------------------------------------------')
print(f'Range {min(r):.2f}-{max(r):.2f} nm')
print(f'Mean {estimators["mean"]:.2f} ({uq["mean"].ci(95)[0]:.2f},{uq["mean"].ci(95)[1]:.2f}) nm')
print(f'Median {estimators["median"]:.2f} ({uq["median"].ci(95)[0]:.2f},{uq["median"].ci(95)[1]:.2f}) nm')
print(f'Interquartile mean {estimators["iqm"]:.2f} ({uq["iqm"].ci(95)[0]:.2f},{uq["iqm"].ci(95)[1]:.2f}) nm')
print(f'Mode {estimators["mode"]:.2f} nm')
print('-------------------------------------------------')
print('Spread')
print('-------------------------------------------------')
print(f'Standard deviation {estimators["std"]:.2f} ({uq["std"].ci(95)[0]:.2f},{uq["std"].ci(95)[1]:.2f}) nm')
print(f'Mean absolute deviation {estimators["mad"]:.2f} ({uq["mad"].ci(95)[0]:.2f},{uq["mad"].ci(95)[1]:.2f}) nm')
print(f'Interquartile range {estimators["iqr"]:.2f} ({uq["iqr"].ci(95)[0]:.2f},{uq["iqr"].ci(95)[1]:.2f}) nm')
print(f'Variance {estimators["var"]:.2f} ({uq["var"].ci(95)[0]:.2f},{uq["var"].ci(95)[1]:.2f}) nm²')
print('-------------------------------------------------')
print('Shape')
print('-------------------------------------------------')
print(f'Modality {estimators["modality"]}')
print(f'Skewness {estimators["skewness"]:.2f} ({uq["skewness"].ci(95)[0]:.2f},{uq["skewness"].ci(95)[1]:.2f}) ')
print(f'Kurtosis {estimators["kurtosis"]:.2f} ({uq["kurtosis"].ci(95)[0]:.2f},{uq["kurtosis"].ci(95)[1]:.2f}) ')
print('-------------------------------------------------')