"""
REFERENCES:
Jackknife and Bootstrap Resampling Methods in Statistical Analysis to Correct for Bias.
P. Young
http://young.physics.ucsc.edu/jackboot.pdf
Notes on Bootstrapping
Author unspecified
http://www.math.ntu.edu.tw/~hchen/teaching/LargeSample/notes/notebootstrap.pdf
"""
import numpy
from numpy import pi, cos
from numpy.random import normal
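
# Jackknife estimators used below (cf. the references above).  Given the
# leave-one-out averages abar_i of a dataset of size N and a (possibly
# nonlinear) function f, the routines jk_wstats_dsets() and jk_stats_aa()
# return the tuple (mean, error, mean_unbiased) with
#
#   f_i           = f(abar_i)                       # jackknifed samples
#   mean          = (1/N) * sum_i f_i
#   error         = sqrt(N-1) * std(f_i)            # jackknife error bar
#   mean_unbiased = N * f(abar) - (N-1) * mean      # bias-corrected estimate
#
# where abar is the average of the full dataset.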

def test1_generate_data(ndata=1000):
    """Generates a synthetic test dataset: `ndata` unit-variance normal
    samples centered at pi/3.
    """
    return pi / 3 + normal(size=ndata)

def test1():
    """Cross-checks the two jackknife code paths on a synthetic dataset."""
    global test1_dset
    test1_dset = test1_generate_data()
    dset = test1_dset
    print "first jackknife routine: jk_generate_datasets -> jk_wstats_dsets"
    dset_jk = jk_generate_datasets(dset)
    cos_avg1 = jk_wstats_dsets(dset_jk, func=numpy.cos)
    print cos_avg1
    print "second jackknife routine: jk_generate_averages -> jk_stats_aa"
    aa_jk = jk_generate_averages(dset)
    cos_avg2 = jk_stats_aa(aa_jk, func=numpy.cos, a=dset)
    print cos_avg2
    # the two results above must be identical

def test2_generate_data():
    rootdir = "/home/wirawan/Work/PWQMC-77/expt/qmc/MnO/AFM2/rh.1x1x1/Opium-GFRG/vol10.41/k-0772+3780+2187.run"
    srcfile = rootdir + "/measurements.h5"
    from pyqmc.results.pwqmc_meas import meas_hdf5
    global test2_db
    test2_db = meas_hdf5(srcfile)

def jk_select_dataset(a, i):
    """Selects the i-th dataset for jackknife operation from a
    given dataset 'a'.
    The argument i must satisfy: 0 <= i < len(a).
    This is essentially deleting the i-th data point from the
    original dataset.
    """
    a = numpy.asarray(a)
    N = a.shape[0]
    assert len(a.shape) == 1
    assert 0 <= i < N
    rslt = numpy.empty(shape=(N-1,), dtype=a.dtype)
    rslt[:i] = a[:i]
    rslt[i:] = a[i+1:]
    return rslt
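
# Illustration (made-up values): deleting index 1 from [10., 20., 30., 40.]
# leaves the remaining three points, e.g.
#
#   jk_select_dataset([10., 20., 30., 40.], 1)   # -> array([ 10., 30., 40.])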

def jk_generate_datasets(a):
    """Generates ALL the datasets for jackknife operation from
    the original dataset 'a'.
    For the i-th dataset, this is essentially deleting the
    i-th data point from 'a'.
    """
    a = numpy.asarray(a)
    N = a.shape[0]
    assert len(a.shape) == 1
    rslt = numpy.empty(shape=(N, N-1,), dtype=a.dtype)
    for i in xrange(N):
        rslt[i, :i] = a[:i]
        rslt[i, i:] = a[i+1:]
    return rslt
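
# Illustration (made-up values): for a = [10., 20., 30., 40.] the result has
# shape (4, 3), where row i is 'a' with element i removed:
#
#   [[ 20., 30., 40.],
#    [ 10., 30., 40.],
#    [ 10., 20., 40.],
#    [ 10., 20., 30.]]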

def jk_generate_averages_old1(a, weights=None):
    """Generates ALL the average samples for jackknife operation
    from the original dataset 'a'.
    For the i-th dataset, this is essentially deleting the
    i-th data point from 'a', then taking the average.
    This version does not store N*(N-1) data points; only (N).
    This version is SLOW because it has to compute
    the averages N times---thus it still computationally scales as N**2.
    """
    a = numpy.asarray(a)
    N = a.shape[0]
    assert len(a.shape) == 1
    aa_jk = numpy.empty(shape=(N,), dtype=a.dtype)
    dset_i = numpy.empty(shape=(N-1,), dtype=a.dtype)
    if weights is not None:
        weights = numpy.asarray(weights)
        weights_i = numpy.empty(shape=(N-1,), dtype=weights.dtype)
    for i in xrange(N):
        dset_i[:i] = a[:i]
        dset_i[i:] = a[i+1:]
        if weights is not None:
            weights_i[:i] = weights[:i]
            weights_i[i:] = weights[i+1:]
            aa_jk[i] = numpy.average(dset_i, weights=weights_i)
        else:
            aa_jk[i] = numpy.mean(dset_i)
    return aa_jk

def jk_generate_averages(a, weights=None):
    """Generates ALL the average samples for jackknife operation
    from the original dataset 'a'.
    For the i-th dataset, this is essentially deleting the
    i-th data point from 'a', then taking the average.
    This version does not store N*(N-1) data points; only (N).
    This version is faster by avoiding N computations of average.
    """
    a = numpy.asarray(a)
    N = a.shape[0]
    assert len(a.shape) == 1
    if weights is not None:
        weights = numpy.asarray(weights)
        assert weights.shape == a.shape
        aw = a * weights
        num = numpy.sum(aw) * 1.0
        denom = numpy.sum(weights)
        # leave-one-out weighted average: drop term i from both sums
        aa_jk = (num - aw) / (denom - weights)
    else:
        num = numpy.sum(a) * 1.0
        # leave-one-out plain average: (total sum - a_i) / (N - 1)
        aa_jk = (num - a) / (N - 1)
    return aa_jk
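
# A quick consistency sketch (the helper name below is hypothetical, used
# only for illustration): the O(N) identity used in jk_generate_averages,
#
#   abar_i = (sum_j w_j*a_j - w_i*a_i) / (sum_j w_j - w_i) ,
#
# should reproduce the explicit O(N**2) loop of jk_generate_averages_old1
# up to floating-point roundoff.
def example_check_jk_averages(ndata=50):
    a = normal(size=ndata)
    w = numpy.abs(normal(size=ndata)) + 0.1   # arbitrary positive weights
    aa_fast = jk_generate_averages(a, weights=w)
    aa_slow = jk_generate_averages_old1(a, weights=w)
    assert numpy.allclose(aa_fast, aa_slow)
    return aa_fast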

'''
def jk_stats_old(a_jk, func=None):
    """a_jk must be in the same format as that produced by
    """
    # get all the jackknived stats.
    if func == None:
        jk_mean = numpy.mean(a_jk, axis=1)
    else:
        jk_mean = numpy.mean(func(a_jk), axis=1)
'''

def jk_wstats_dsets(a_jk, w_jk=None, func=None):
    """Computes the jackknife statistics from the preprocessed datasets
    produced by the jk_generate_datasets() routine.
    The inputs a_jk and w_jk must be in the same format as that produced
    by jk_generate_datasets().
    """
    # get all the jackknifed stats.
    N = len(a_jk)
    # reconstruct the full "a" array from the first two jackknife rows:
    a = numpy.empty(shape=(N,), dtype=a_jk.dtype)
    a[1:] = a_jk[0]
    a[0] = a_jk[1][0]
    if func is None:
        func = lambda x: x
    aa_jk = numpy.average(a_jk, axis=1, weights=w_jk)
    #print aa_jk
    f_jk = func(aa_jk)
    mean = numpy.mean(f_jk)
    var = numpy.std(f_jk) * numpy.sqrt(N-1)   # jackknife error estimate
    mean_unbiased = N * func(a.mean()) - (N-1) * mean
    return (mean, var, mean_unbiased)
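
# Note: for an unweighted dataset 'a', the "datasets" route and the
# "averages" route agree, i.e.
#
#   jk_wstats_dsets(jk_generate_datasets(a), func=f)
#
# returns the same (mean, error, mean_unbiased) tuple as
#
#   jk_stats_aa(jk_generate_averages(a), func=f, a=a)
#
# which is what test1() above verifies.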

def jk_stats_aa(aa_jk, func=None, a=None):
    """Computes the jackknife statistics from the preprocessed
    jackknife averages (aa_jk).
    The input array aa_jk is computed by jk_generate_averages().
    If the original dataset 'a' is also given, the bias-corrected mean
    is computed as well; otherwise None is returned in its place.
    """
    # get all the jackknifed stats.
    N = len(aa_jk)
    if func is None:
        func = lambda x: x
    f_jk = func(aa_jk)
    mean = numpy.mean(f_jk)
    var = numpy.std(f_jk) * numpy.sqrt(N-1)   # jackknife error estimate
    if a is not None:
        a = numpy.asarray(a)
        mean_unbiased = N * func(a.mean()) - (N-1) * mean
    else:
        mean_unbiased = None
    return (mean, var, mean_unbiased)
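
# Minimal driver, assuming the module may be run directly as a quick
# self-test of the two jackknife code paths:
if __name__ == "__main__":
    test1()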