# My tools of the trade for python programming.
"""
REFERENCES:
Jackknife and Bootstrap Resampling Methods in Statistical Analysis to Correct for Bias.
P. Young
http://young.physics.ucsc.edu/jackboot.pdf
Notes on Bootstrapping
Author unspecified
http://www.math.ntu.edu.tw/~hchen/teaching/LargeSample/notes/notebootstrap.pdf
"""
import numpy
from numpy import pi, cos
from numpy.random import normal
def test1_generate_data(ndata=1000):
    """Draw a synthetic 1-D self-test dataset: pi/3 plus unit-normal noise.

    ndata -- number of samples to draw.
    Returns a numpy array of shape (ndata,).
    """
    noise = normal(size=ndata)
    return noise + pi / 3
def test1():
    """Self-test: run both jackknife routines on the same synthetic dataset.

    The mean and variance reported by the two routines must be identical,
    since they are two implementations of the same leave-one-out estimator.
    Side effect: stores the generated dataset in the module-global
    `test1_dset` for later inspection.
    """
    global test1_dset
    test1_dset = test1_generate_data()
    dset = test1_dset
    # BUG FIX: the original called `jk_wstats`, which does not exist;
    # the routine is named jk_wstats_dsets.
    print("first jackknife routine: jk_generate_datasets -> jk_wstats_dsets")
    dset_jk = jk_generate_datasets(dset)
    cos_avg1 = jk_wstats_dsets(dset_jk, func=numpy.cos)
    print(cos_avg1)
    print("second jackknife routine: jk_generate_averages -> jk_stats_aa")
    aa_jk = jk_generate_averages(dset)
    cos_avg2 = jk_stats_aa(aa_jk, func=numpy.cos)
    print(cos_avg2)
    # the two results above must be identical
def test2_generate_data():
    """Open a site-specific QMC measurement database for self-testing.

    Side effect: stores the opened database object in the module-global
    `test2_db`.

    NOTE(review): depends on a hard-coded filesystem path and the
    project-local `pyqmc` package, so this only runs on the original
    author's machine.
    """
    rootdir = "/home/wirawan/Work/PWQMC-77/expt/qmc/MnO/AFM2/rh.1x1x1/Opium-GFRG/vol10.41/k-0772+3780+2187.run"
    srcfile = rootdir + "/measurements.h5"
    # Imported lazily so the module still loads when pyqmc is absent.
    from pyqmc.results.pwqmc_meas import meas_hdf5
    global test2_db
    test2_db = meas_hdf5(srcfile)
def jk_select_dataset(a, i):
    """Selects the i-th dataset for jackknife operation from a
    given 1-D dataset 'a'.

    This is essentially deleting the i-th data point from the
    original dataset.

    The argument i must satisfy: 0 <= i < len(a).
    (The original docstring had a typo: "0 <= 0 < len(a)".)

    Returns a new array of length len(a)-1; 'a' is not modified.
    """
    a = numpy.asarray(a)
    assert len(a.shape) == 1
    N = a.shape[0]
    assert 0 <= i < N
    # numpy.delete returns a copy of `a` with element i removed,
    # replacing the manual two-slice splice of the original.
    return numpy.delete(a, i)
def jk_generate_datasets(a):
    """Generates ALL the datasets for jackknife operation from
    the original 1-D dataset 'a'.

    Returns an (N, N-1) array whose i-th row is 'a' with the i-th
    data point deleted.
    """
    a = numpy.asarray(a)
    assert len(a.shape) == 1
    N = a.shape[0]
    rslt = numpy.empty(shape=(N, N-1,), dtype=a.dtype)
    # `range` instead of the Python-2-only `xrange` (works on both).
    for i in range(N):
        rslt[i, :i] = a[:i]
        rslt[i, i:] = a[i+1:]
    return rslt
def jk_generate_averages_old1(a, weights=None):
    """Generates ALL the average samples for jackknife operation
    from the original 1-D dataset 'a'.

    The i-th sample is the (optionally weighted) average of 'a'
    with the i-th data point deleted.
    This version does not store N*(N-1) data points; only (N).
    This version is SLOW because it has to compute
    the averages N times---thus it still computationally scales as N**2.

    weights: optional array-like of the same length as 'a'.
    Returns an array of N leave-one-out averages.
    """
    a = numpy.asarray(a)
    assert len(a.shape) == 1
    N = a.shape[0]
    aa_jk = numpy.empty(shape=(N,), dtype=a.dtype)
    dset_i = numpy.empty(shape=(N-1,), dtype=a.dtype)
    # BUG FIX: `weights != None` is an element-wise comparison for
    # ndarrays (ambiguous truth value); use an identity test instead.
    if weights is not None:
        # asarray so plain lists are accepted (the original assumed
        # `weights` already had a .dtype attribute).
        weights = numpy.asarray(weights)
        weights_i = numpy.empty(shape=(N-1,), dtype=weights.dtype)
    for i in range(N):
        dset_i[:i] = a[:i]
        dset_i[i:] = a[i+1:]
        if weights is not None:
            weights_i[:i] = weights[:i]
            weights_i[i:] = weights[i+1:]
            aa_jk[i] = numpy.average(dset_i, weights=weights_i)
        else:
            aa_jk[i] = numpy.mean(dset_i)
    return aa_jk
def jk_generate_averages(a, weights=None):
    """Generates ALL the average samples for jackknife operation
    from the original 1-D dataset 'a'.

    The i-th sample is the (optionally weighted) average of 'a'
    with the i-th data point deleted.
    This version does not store N*(N-1) data points; only (N).
    This version is faster by avoiding N computations of average:
    the full sum is computed once, and each leave-one-out average
    is obtained by subtracting the left-out point.

    weights: optional array-like of the same length as 'a'.
    Returns an array of N leave-one-out averages.
    """
    a = numpy.asarray(a)
    assert len(a.shape) == 1
    N = a.shape[0]
    # BUG FIX: `weights != None` is an element-wise comparison for
    # ndarrays (ambiguous truth value); use an identity test instead.
    if weights is not None:
        weights = numpy.asarray(weights)
        assert weights.shape == a.shape
        aw = a * weights
        num = numpy.sum(aw) * 1.0   # * 1.0 forces float division
        denom = numpy.sum(weights)
        aa_jk = (num - aw) / (denom - weights)
    else:
        num = numpy.sum(a) * 1.0
        # BUG FIX: the original read `(num - a[i])` with `i` undefined
        # in this scope (NameError); the vectorized form is intended.
        aa_jk = (num - a) / (N - 1)
    return aa_jk
'''
def jk_stats_old(a_jk, func=None):
"""a_jk must be in the same format as that produced by
"""
# get all the jackknived stats.
if func == None:
jk_mean = numpy.mean(a_jk, axis=1)
else:
jk_mean = numpy.mean(func(a_jk), axis=1)
'''
def jk_wstats_dsets(a_jk, w_jk=None, func=None):
    """Computes the jackknife statistics from the preprocessed datasets
    produced by the jk_generate_datasets() routine.

    The input a_jk and w_jk must be in the same format as that produced by
    jk_generate_datasets (shape (N, N-1)).

    func: optional function applied to each jackknife average before
    the statistics are taken (identity if None).

    Returns a tuple (mean, var, mean_unbiased) where `var` is the
    jackknife estimate of the standard error (std * sqrt(N-1)) and
    `mean_unbiased` is the bias-corrected mean N*f(<a>) - (N-1)*mean.
    """
    N = len(a_jk)
    # Reconstruct the full "a" array from the jackknife datasets:
    # row 0 of a_jk is `a` minus element 0, so it supplies a[1:];
    # row 1 is `a` minus element 1, so its first entry is a[0].
    a = numpy.empty(shape=(N,), dtype=a_jk.dtype)
    a[1:] = a_jk[0]
    a[0] = a_jk[1][0]
    # Identity test is the idiomatic (and ndarray-safe) None check.
    if func is None:
        func = lambda x: x
    aa_jk = numpy.average(a_jk, axis=1, weights=w_jk)
    f_jk = func(aa_jk)
    mean = numpy.mean(f_jk)
    var = numpy.std(f_jk) * numpy.sqrt(N-1)
    # NOTE(review): when w_jk is given, a.mean() is still unweighted;
    # preserved from the original — confirm whether this is intended.
    mean_unbiased = N * func(a.mean()) - (N-1) * mean
    return (mean, var, mean_unbiased)
def jk_stats_aa(aa_jk, func=None, a=None):
    """Computes the jackknife statistics from the preprocessed
    jackknife averages (aa_jk).

    The input array aa_jk is computed by jk_generate_averages().

    func: optional function applied to each jackknife average before
    the statistics are taken (identity if None).
    a: optional original dataset; if given, the bias-corrected mean
    is computed as well, otherwise it is returned as None.

    Returns a tuple (mean, var, mean_unbiased) where `var` is the
    jackknife estimate of the standard error (std * sqrt(N-1)).
    """
    N = len(aa_jk)
    if func is None:
        func = lambda x: x
    f_jk = func(aa_jk)
    mean = numpy.mean(f_jk)
    var = numpy.std(f_jk) * numpy.sqrt(N-1)
    # BUG FIX: `a != None` is an element-wise comparison when `a` is an
    # ndarray (ambiguous truth value); use an identity test instead.
    if a is not None:
        mean_unbiased = N * func(a.mean()) - (N-1) * mean
    else:
        mean_unbiased = None
    return (mean, var, mean_unbiased)