Source code for deerlab.bootstrap_analysis

# bootstrap_analysis.py - Bootstrap analysis for uncertainty estimation
# --------------------------------------------------------------------------
# This file is a part of DeerLab. License is MIT (see LICENSE.md).
# Copyright(c) 2019-2023: Luis Fabregas, Stefan Stoll and other contributors.

import numpy as np
import types
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from deerlab.classes import UQResult
from deerlab.utils import isnumeric
from deerlab.noiselevel import noiselevel

[docs] def bootstrap_analysis(fcn,Vexp,Vfit, samples=1000, noiselvl=None, resampling='gaussian', verbose = False, cores=1, memorylimit=8): r""" Bootstrap analysis for uncertainty quantification This function creates new synthetic datasets by using the noiseless model prediction ``Vfit`` and adding noise, according to the specified method. The ``fcn`` function is then applied to each of the new datasets to calculate the uncertainty of the variables returned by ``fcn``. Parameters ---------- fcn : callable Function to be analyzed. Must be a callable function ``fcn(V)`` accepting a dataset array as input and returning a tuple with all variables to be analyzed. All variables must be numerical arrays (no strings or booleans) and must preserve shape between calls. Vexp : array_like or list of array_like Experimental dataset(s). Vfit : array or list of array_like Fit of the dataset(s). samples : scalar, optional Number of bootstrap samples to analyze. The quality of bootstrapping results improve with the number of boostrap samples evaluated, the default is 1000. noiselvl : scalar or list thereof Noise level of the input dataset(s), specified as standard deviations. If not specified, these are automatically estimated from the experimental dataset(s). resampling : string, optional Specifies the method employed for re-sampling new bootstrap samples. * ``'gaussian'`` - Sample noise from a Gaussian distribution. * ``'residual'`` - Sample noise from the fit residuals. The default is ``'gaussian'``. cores : scalar, optional Number of CPU cores/processes for parallel computing. If ``cores=1`` no parallel computing is used. If ``cores=-1`` all available CPUs are used. The default is one core (no parallelization). verbose : boolean, optional Specifies whether to print the progress of the bootstrap analysis on the command window, the default is false. memorylimit : Memory limit to be allocated for the full boostrap analysis. If the requested analysis exceeds this limit, the execution will be stopped. The default is 12GB. Returns ------- bootuq : :ref:`UQResult` or list of :ref:`UQResult` Bootstrap uncertainty quantification for each variable returned by ``fcn``. Examples -------- To analyze several variables during the same run, the function must return tuple containing all of them:: def myfcn(V): Pfit1 = dl.nlls(V,dl.dd_gauss,r,K) Pfit2 = dl.nlls(V,dl.dd_randcoil,r,K) return Pfit1,Pfit2 bootuq = bootstrap_analysis(myfcn,V,Vfit) Pfit1_uq = bootuq[0] Pfit2_uq = bootuq[1] """ # Catch invalid sample sizes if samples<2: raise ValueError('At least two bootstrap samples are required.') nSamples = int(samples) # Validation of input signals Vexp,Vfit = ( [V] if type(V) is np.ndarray else V for V in [Vexp,Vfit]) if len(Vexp)!=len(Vfit): raise KeyError('The same number of signals V and fits Vfit must be provided.') nSignals = len(Vexp) for i in range(len(Vfit)): if len(Vexp[i])!=len(Vfit[i]): raise KeyError(f'The Vexp and Vfit #{i} must have the same number of elements.') if type(fcn) is not types.FunctionType: raise KeyError('The 1st argument must be a callable function accepting dataset(s) as input.') # Get residuals and estimate standard deviation for i in range(nSignals): residuals = [Vfit[i] - Vexp[i] for i in range(nSignals)] if noiselvl is None: noiselvl = [noiselevel(Vexp[i]) for i in range(nSignals)] noiselvl = np.atleast_1d(noiselvl) # Prepare bootstrap sampler (reduced I/O-stream when parallelized) def sample(): Vsample = [0]*nSignals for i in range(nSignals): #Determine requested re-sampling method if resampling == 'gaussian': # Resample from a Gaussian distribution with variance estimated from the residuals Vsample[i] = Vfit[i] + np.random.normal(0, noiselvl[i], len(Vfit[i])) elif resampling == 'residual': # Resample from the residual directly Vsample[i] = Vfit[i] + residuals[i][np.random.permutation(len(Vfit[i]))] else: raise KeyError("Resampling method not found. Must be either 'gaussian' or 'residual'.") return Vsample # Use original data on the first run varargout = fcn(Vexp) if type(varargout) is not tuple: varargout = (varargout,) nout = len(varargout) # Assert that all outputs are strictly numerical for var in varargout: if not all(isnumeric(x) for x in np.atleast_1d(var)): raise ValueError('Non-numeric output arguments by the analyzed function are not accepted.') # Check that the full bootstrap analysis will not exceed the memory limits memory_requirements = np.sum([nSamples*np.atleast_1d(var).size*np.atleast_1d(var).itemsize])/1e9 # GB if memory_requirements > memorylimit: raise MemoryError(f'The requested bootstrap analysis requires {memory_requirements:.2f}GB, exceeding the current memory limit {memorylimit:.2f}GB.') # Get ndarray shapes of all outputs shapeout = [] evals = [] for j in range(nout): out = np.atleast_1d(varargout[j]) shapeout.append(np.shape(out)) evals.append(out[np.newaxis,:]) # Main bootstrap function def bootsample(): # Get bootstrap sample Vsample = sample() # Run the analysis function out = fcn(Vsample) return out # Run bootsample() for all samples in series (cores=1) or in parallel (cores>1) if verbose : print(f'Bootstrap analysis with {cores} cores:') out = _ProgressParallel(n_jobs=cores,total=nSamples,use_tqdm=verbose)(delayed(bootsample)() for _ in range(nSamples)) # Post-process the outputs for varargout in out: if type(varargout) is not tuple: varargout = (varargout,) nout = len(varargout) # Loop over the different outputs of the function for j in range(nout): out = np.atleast_1d(varargout[j]) if np.shape(out)!=shapeout[j]: raise TypeError('Inconsistent output variable shape. One of the outputs of the analyzed function is changing its size/shape in between runs.') # Store outputs in an N-dimensional array evals[j] = np.concatenate((evals[j],out[np.newaxis,:]),0) # Compile statistics for all parameters from bootstrap samples #------------------------------------------------------------------------------- stats = [] for variables in evals: stats.append(UQResult('bootstrap',variables)) if len(stats)==1: stats = stats[0] return stats
#------------------------------------------------------------------------------- class _ProgressParallel(Parallel): """ Patch for joblib.Parallel Overrides the print_progress() method to enable the synchronous use of the TQDM bar even for parallel processing. """ def __init__(self, use_tqdm=True, total=None, *args, **kwargs): self._use_tqdm = use_tqdm self._total = total super().__init__(*args, **kwargs) def __call__(self, *args, **kwargs): with tqdm(disable=not self._use_tqdm, total=self._total) as self._pbar: return Parallel.__call__(self, *args, **kwargs) def print_progress(self): if self._total is None: self._pbar.total = self.n_dispatched_tasks self._pbar.n = self.n_completed_tasks self._pbar.refresh() #-------------------------------------------------------------------------------