Source code for deerlab.bootstrap_analysis

# bootstrap_analysis.py - Bootstrap analysis for uncertainty estimation
# --------------------------------------------------------------------------
# This file is a part of DeerLab. License is MIT (see LICENSE.md).
# Copyright(c) 2019-2023: Luis Fabregas, Stefan Stoll and other contributors.

import numpy as np
import types
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from deerlab.classes import UQResult
from deerlab.utils import isnumeric
from deerlab.noiselevel import noiselevel


[docs]
def bootstrap_analysis(fcn,Vexp,Vfit, samples=1000, noiselvl=None, resampling='gaussian', verbose = False, cores=1, memorylimit=8):
    r""" 
    Bootstrap analysis for uncertainty quantification

    This function creates new synthetic datasets by using the noiseless model
    prediction ``Vfit`` and adding noise, according to the specified method. 
    The ``fcn`` function is then applied to each of the new datasets to calculate
    the uncertainty of the variables returned by ``fcn``.

    Parameters
    ----------
    fcn : callable
        Function to be analyzed. Must be a callable function ``fcn(V)`` accepting
        a dataset array as input and returning a tuple with all variables to be analyzed.
        All variables must be numerical arrays (no strings or booleans) and 
        must preserve shape between calls.

    Vexp : array_like or list of array_like
        Experimental dataset(s).

    Vfit : array or list of array_like
        Fit of the dataset(s).

    samples : scalar, optional
        Number of bootstrap samples to analyze. The quality of bootstrapping 
        results improve with the number of boostrap samples evaluated, the 
        default is 1000.

    noiselvl : scalar or list thereof
        Noise level of the input dataset(s), specified as standard deviations.
        If not specified, these are automatically estimated from the experimental
        dataset(s). 

    resampling : string, optional
        Specifies the method employed for re-sampling new bootstrap samples.

        * ``'gaussian'`` - Sample noise from a Gaussian distribution.
        * ``'residual'`` - Sample noise from the fit residuals.
        
        The default is ``'gaussian'``.

    cores : scalar, optional
        Number of CPU cores/processes for parallel computing. If ``cores=1`` no parallel 
        computing is used. If ``cores=-1`` all available CPUs are used. The default is 
        one core (no parallelization).

    verbose : boolean, optional
        Specifies whether to print the progress of the bootstrap analysis on the 
        command window, the default is false.

    memorylimit : 
        Memory limit to be allocated for the full boostrap analysis. If the requested analysis exceeds this limit, the 
        execution will be stopped. The default is 12GB.  

    Returns
    -------
    bootuq : :ref:`UQResult` or list of :ref:`UQResult`
        Bootstrap uncertainty quantification for each variable returned by ``fcn``. 


    Examples
    --------
    To analyze several variables during the same run, the function must return tuple 
    containing all of them::

        def myfcn(V):
            Pfit1 = dl.nlls(V,dl.dd_gauss,r,K)
            Pfit2 = dl.nlls(V,dl.dd_randcoil,r,K)
            return Pfit1,Pfit2

        bootuq = bootstrap_analysis(myfcn,V,Vfit)
        Pfit1_uq = bootuq[0]
        Pfit2_uq = bootuq[1]


    """

    # Catch invalid sample sizes
    if samples<2:
        raise ValueError('At least two bootstrap samples are required.')
    nSamples = int(samples)

    # Validation of input signals
    Vexp,Vfit = ( [V] if type(V) is np.ndarray else V for V in [Vexp,Vfit])
    if len(Vexp)!=len(Vfit):
        raise KeyError('The same number of signals V and fits Vfit must be provided.')
    nSignals = len(Vexp)
    for i in range(len(Vfit)):
        if len(Vexp[i])!=len(Vfit[i]):
            raise KeyError(f'The Vexp and Vfit #{i} must have the same number of elements.')
    if type(fcn) is not types.FunctionType:
        raise KeyError('The 1st argument must be a callable function accepting dataset(s) as input.')

    # Get residuals and estimate standard deviation
    for i in range(nSignals):
        residuals = [Vfit[i] - Vexp[i] for i  in range(nSignals)]
    if noiselvl is None:
        noiselvl = [noiselevel(Vexp[i]) for i in range(nSignals)]
    noiselvl = np.atleast_1d(noiselvl)

    # Prepare bootstrap sampler (reduced I/O-stream when parallelized) 
    def sample():
        Vsample = [0]*nSignals
        for i in range(nSignals):
            #Determine requested re-sampling method
            if resampling == 'gaussian':
                # Resample from a Gaussian distribution with variance estimated from the residuals
                Vsample[i] = Vfit[i] + np.random.normal(0, noiselvl[i], len(Vfit[i]))
            elif resampling == 'residual':
                # Resample from the residual directly
                Vsample[i] =  Vfit[i] + residuals[i][np.random.permutation(len(Vfit[i]))]
            else:
                raise KeyError("Resampling method not found. Must be either 'gaussian' or 'residual'.")
        return Vsample       

    # Use original data on the first run
    varargout = fcn(Vexp)
    if type(varargout) is not tuple:
        varargout = (varargout,)
    nout = len(varargout)

    # Assert that all outputs are strictly numerical
    for var in varargout:
        if not all(isnumeric(x) for x in np.atleast_1d(var)):
            raise ValueError('Non-numeric output arguments by the analyzed function are not accepted.')
    
    # Check that the full bootstrap analysis will not exceed the memory limits
    memory_requirements = np.sum([nSamples*np.atleast_1d(var).size*np.atleast_1d(var).itemsize])/1e9 # GB
    if memory_requirements > memorylimit: 
        raise MemoryError(f'The requested bootstrap analysis requires {memory_requirements:.2f}GB, exceeding the current memory limit {memorylimit:.2f}GB.')

    # Get ndarray shapes of all outputs
    shapeout = []
    evals = []
    for j in range(nout):
        out = np.atleast_1d(varargout[j])
        shapeout.append(np.shape(out))
        evals.append(out[np.newaxis,:])

    # Main bootstrap function
    def bootsample():
        # Get bootstrap sample
        Vsample = sample()
        # Run the analysis function
        out = fcn(Vsample)
        return out

    # Run bootsample() for all samples in series (cores=1) or in parallel (cores>1)
    if verbose : print(f'Bootstrap analysis with {cores} cores:')          
    out = _ProgressParallel(n_jobs=cores,total=nSamples,use_tqdm=verbose)(delayed(bootsample)() for _ in range(nSamples))
    
    # Post-process the outputs
    for varargout in out:
        if type(varargout) is not tuple:
            varargout = (varargout,)
        nout = len(varargout)

        # Loop over the different outputs of the function
        for j in range(nout):
            out = np.atleast_1d(varargout[j])
            if np.shape(out)!=shapeout[j]: 
                raise TypeError('Inconsistent output variable shape. One of the outputs of the analyzed function is changing its size/shape in between runs.')
            # Store outputs in an N-dimensional array
            evals[j] = np.concatenate((evals[j],out[np.newaxis,:]),0)
            
        
    # Compile statistics for all parameters from bootstrap samples
    #-------------------------------------------------------------------------------
    stats = []
    for variables in evals:
        stats.append(UQResult('bootstrap',variables))
    if len(stats)==1:
        stats = stats[0]
    return stats



#-------------------------------------------------------------------------------
class _ProgressParallel(Parallel):
    """
    Patch for joblib.Parallel

    Overrides the print_progress() method to enable the synchronous use of the TQDM bar
    even for parallel processing.  
    """
    def __init__(self, use_tqdm=True, total=None, *args, **kwargs):
        self._use_tqdm = use_tqdm
        self._total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        with tqdm(disable=not self._use_tqdm, total=self._total) as self._pbar:
            return Parallel.__call__(self, *args, **kwargs)

    def print_progress(self):
        if self._total is None:
            self._pbar.total = self.n_dispatched_tasks
        self._pbar.n = self.n_completed_tasks
        self._pbar.refresh()
#-------------------------------------------------------------------------------