wpylib/math/stats/array_stats.py

# $Id: __init__.py,v 1.1 2010-10-07 15:57:29 wirawan Exp $
#
# wpylib.math.stats.array_stats module
# Created: 20140404
# Wirawan Purwanto
#

"""
wpylib.math.stats.array_stats module
Tools for studying the statistics of an array, the difference of
two or more arrays, etc.

"""

import os
import sys
import numpy


# Functions below were imported from V2b_inspect.py, 20140404

"""
CONVENTIONS

- Usually, M1 is the matrix under examination, M2 is the reference matrix.


PRINCIPLES

- Make sure functions are dimension-independent as much as possible
  (i.e. not limited to 1D or 2D only).
- Text output should not hardwired only to sys.stdout via vanilla print
  statement.
"""

def rms(m):
  return numpy.sqrt(numpy.sum(m**2) / m.size)


def report_diff_stat(M1, M2, out=sys.stdout):
  # Original function name: statdiff
  """Studies the difference of two arrays (or matrices).
  Usually, M1 is the matrix under examination, M2 is the reference
  matrix.
  Prints out standard report to a given text file, or returns
  the report as a string.
  """
  dM = M1 - M2
  if len(M1.shape) == 2:
    dM_diag = numpy.diagonal(dM)
    stats = (
      dM.max(),
      dM.min(),
      dM.mean(),
      numpy.abs(dM).mean(),
      rms(dM),
      dM_diag.max(),
      dM_diag.min(),
      dM_diag.mean(),
      numpy.abs(dM_diag).mean(),
      rms(dM_diag),
    )
    if out == tuple:
      return stats

    rslt = """\
- max difference  = %13.6e
- min difference  = %13.6e
- mean difference = %13.6e
- mean abs diff   = %13.6e
- rms difference  = %13.6e
- max difference  = %13.6e   on diagonal
- min difference  = %13.6e   on diagonal
- mean difference = %13.6e   on diagonal
- mean abs diff   = %13.6e   on diagonal
- rms difference  = %13.6e   on diagonal
""" % stats
  else:
    stats = (
      dM.max(),
      dM.min(),
      dM.mean(),
      numpy.abs(dM).mean(),
      rms(dM),
    )
    if out == tuple:
      return stats

    rslt = """\
- max difference  = %13.6e
- min difference  = %13.6e
- mean difference = %13.6e
- mean abs diff   = %13.6e
- rms difference  = %13.6e
""" % stats

  if out == str:
    return rslt
  else:
    out.write(rslt)
    out.flush()


def maxadiff(M1, M2):
  # Original function name: maxdiff
  """Returns the maximum absolute difference between two matrices.
  Used for checking whether two matrices are identical."""
  return numpy.abs(M1 - M2).max()


def maxadiff_by_col(M1, M2):
  # Original function name: maxdiff_by_col
  """Assuming two-dimensional array (where first index is the row
  index), computes the maximum absolute difference.
  """
  return numpy.abs(M1 - M2).max(axis=0)


def maxradiff(M1, M2, ztol=1e-15):
  # Original function name: maxrdiff
  """Returns the maximum *relative* absolute difference between two matrices.
  Used for checking whether two matrices are identical.

  CAVEATS:
  - Zero elements in M2 (reference matrix) are converted to unity for
    calculating the relative difference.
  """
  (mapprox, mref) = (M1, M2)
  diff = numpy.asarray(numpy.abs(mapprox - mref))
  mref0 = numpy.asarray(numpy.abs(mref))
  numpy.putmask(mref0, mref0 < ztol, [1.0])

  return (diff / mref0).max()


def rdiff(M1, M2, ztol=1e-15, Abs=False):
  """Returns the *relative* [absolute] difference between two matrices.
  Used for checking whether two matrices are identical.
  Here, M1 is the array being compared, and M2 is the reference array.

  CAVEATS:
  - Zero elements in M2 (reference matrix) are converted to unity for
    calculating the relative difference.
  """
  from numpy import abs, array, asarray
  (mapprox, mref) = (M1, M2)
  if Abs:
    diff = abs(asarray(mapprox) - asarray(mref))
    mref0 = abs(asarray(mref))
    numpy.putmask(mref0, mref0 < ztol, [1.0])
  else:
    diff = asarray(mapprox) - asarray(mref)
    mref0 = array(mref, copy=True)  # must make a copy
    numpy.putmask(mref0, abs(mref0) < ztol, [1.0])

  return (diff / mref0)


class ArrayStat(object):
  """A class to compute the statistics of an array.
  """
  def __init__(self, mat, save=False):
    mat = numpy.asarray(mat)
    amat = numpy.abs(mat)
    if save:
      self.mat = mat
    self.min = mat.min()
    self.max = mat.max()
    self.mean = mat.mean()
    self.amin = amat.min()
    self.amax = amat.max()
    self.amean = amat.mean()
    self.rms = numpy.sqrt(numpy.sum(amat**2) / amat.size)

  def report(self, out=sys.stdout):
    """Prints out a standard report.
    """
    #print_histogram(study_sparsity(self.diff, (log_delta-5.5,log_delta+4.5+0.1)), xaxis='left_edge')
    rslt = """\
. min       = %12.6g
. max       = %12.6g
. mean      = %12.6g
. absmin    = %12.6g
. absmax    = %12.6g
. absmean   = %12.6g
. rms       = %12.6g
""" % (self.min, self.max, self.mean,
       self.amin, self.amax, self.amean,
       self.rms)

    if out == str:
      return rslt
    else:
      out.write(rslt)
      out.flush()


def study_sparsity(M, binrange=(-16.5,6.5,1.0), kind='log10x', minzero=1e-30, args={}):
  # Adapted from V2b_inspect.py research module.
  """Study the distribution of values or sparsity of a given array, M,
  using histogram.

  Returns a tuple of length N; the first (N-2) elements are the
  histogram() output, while the last two are the number of elements
  whose values are smaller (greater) than the lower (upper) bound of
  the histogram bind range.

  Argument 'kind': By default, the 'x' values of the histogram is on
  logarithmic-10 scale.
  Other common log scales can be chosen using 'log2x' and 'logx'.
  Choose 'linx' if you want the linear scale instead.

  Argument 'binrange' can be one of the following:
  - 3-tuple: the left edge of the first bin, the right edge of the last bin,
    and the width of each bin
  - 2-tuple: the left MIDPOINT of the first bin, the right MIDPOINT
    of the last bin; the width is assumed to be 1.

  The defaults are a 3-tuple, intended for log10x scale, where
  we check order-of-magnitudes between 1e-16 and 1e+5 (roughly).
  """
  from numpy import abs, count_nonzero, arange, histogram, log10, log2, log
  if kind == 'log10x': # usual way, for broad view
    log_val = log10(abs(M.flatten()) + minzero)
  elif kind == 'log2x':
    log_val = log2(abs(M.flatten()) + minzero)
  elif kind == 'logx':
    log_val = log(abs(M.flatten()) + minzero)
  elif kind == 'linx': # linear x scale, usually for more detailed view
    log_val = abs(M.flatten())
  else:
    raise ValueError, "Invalid kind=%s" % (str(kind))
  if len(binrange) == 3:
    binedges = numpy.arange(*binrange)
  elif len(binrange) == 2:
    l = binrange[0]-0.5
    r = binrange[1]+0.5
    binedges = numpy.arange(l,r)
  else:
    raise ValueError, "Invalid binrange parameter value"
  #print binedges
  hist = histogram(log_val, bins=binedges, **args)
  # Count values outside the range being considered:
  l = 0
  r = 0
  llim = binedges[0]
  rlim = binedges[-1]
  l = count_nonzero(log_val < llim)
  r = count_nonzero(log_val > rlim)
  return hist + (l,r)


def print_histogram(hist, xaxis='midpt',
                    # output formatting
                    xwidth=3, xprec=0,
                    out=sys.stdout):
  # Adapted from V2b_inspect.py research module.
  """Prints histogram in an easier way to visualize in text fashion.
  """
  if len(hist) >= 4:
    # special case: for study_sparsity output.
    (bar,edges,lexcess,rexcess) = hist
  else:
    # for all other cases--
    (bar,edges) = hist[:2]
    lexcess = 0
    rexcess = 0
  if xaxis == 'midpt':
    midpt = (edges[1:] + edges[:-1]) * 0.5
  elif xaxis == 'leftpt':
    midpt = edges[:-1]
  elif xaxis == 'rightpt':
    midpt = edges[1:]
  else:
    raise ValueError, "Invalid xaxis parameter value"
  width = max(int(numpy.log10(max(bar.max(), lexcess, rexcess))+1), xwidth)
  #print "width = %d" % width
  barfmt = " %" + str(width) + "d"
  midfmt = " %" + str(width) + "." + str(xprec) + "f"
  if out == str:
    from StringIO import StringIO
    out = StringIO()
    def retval():
      return out.getvalue()
  else:
    def retval():
      return None

  def Print(x):
    out.write(x)

  out.write((barfmt % lexcess) \
            + "".join([ barfmt % i for i in bar ]) \
            + (barfmt % rexcess) \
            + "\n")
  out.write(" " * width + "<" \
            + "".join([ midfmt % i for i in midpt ]) \
            + " " * width + ">" \
            + "\n")

  return retval()