wpylib/db/indexing_float.py

#
# wpylib.db.indexing_float
# Utilities for indexing based on floating-point values
#
# Wirawan Purwanto
# Created: 20130301
#

"""\
wpylib.db.indexing_float
Utilities for indexing based on floating-point values
"""

import numpy
import sys


def _debug_gen_float_indices1(localvars, debug):
  from wpylib.params.params_flat import Parameters as params
  L = params(localvars)
  if debug > 50:
    print "a_sorted         = ", L.a_sorted[1:]
    print "a_diff           = ", L.a_diff
    print "a_avg_abs        = ", L.a_avg_abs
    print "a_rdiff          = ", L.a_rdiff
    print
    #print "rdiff_idx_sorted =   ", L.rdiff_idx_sorted  #   numpy.array(L.rdiff_idx_sorted, dtype=float)
    print "rdiff_idx_sorted =  ", " ".join([ "%11d" % i for i in L.rdiff_idx_sorted ])
    print "too_close        =  ", " ".join([ "%11d" % int(i) for i in (L.a_rdiff[L.rdiff_idx_sorted] < L.rdiff_threshold) ])
    print "a_rdiff(sort)    = ", L.a_rdiff[L.rdiff_idx_sorted]
    print "a(sort)          = ", L.a_sorted[1:][L.rdiff_idx_sorted]
    print

def _debug_gen_float_indices2(localvars, debug):
  from wpylib.params.params_flat import Parameters as params
  L = params(localvars)
  if debug > 50:
    print
    print "a_rdiff aft      = ", L.a_rdiff
    print "num unique vals  = ", L.n_all_unique_vals
    print "num already uniq = ", len(L.a_already_unique)
    print "unique_vals      = ", L.unique_vals[0:L.n_all_unique_vals]
    print "unique_vals(sort)= ", numpy.sort(L.unique_vals[0:L.n_all_unique_vals])

def _debug_gen_float_indices_found_duplicates(localvars, debug):
  from wpylib.params.params_flat import Parameters as params
  L = params(localvars)
  if debug > 100:
    print "i=", L.i_found, " fused range is ", L.i1, ":", L.i+1
    print " rdiff", L.orig_rdiff
    print "  idx ", L.i1, L.i, ", arr ", L.a_fused_sect
    print "  avg ", L.avg

def _debug_gen_float_indices_results(localvars, debug):
  from wpylib.params.params_flat import Parameters as params
  L = params(localvars)
  if debug > 50:
    print
    print "rslt_vals        = ", L.rslt_vals
    print "unique_map       = ", L.unique_map


def generate_float_indices(arr, rdiff_threshold, debug=0):
  """Consolidates floating point values to `unique' values whose relative
  differences are greater than a specified threshold (rdiff_threshold).
  Values that are so close together will fused to their average.

  The input must be a one-dimensional array or list or a list-like iterable.
  """
  from wpylib.db.result_base import result_base
  sample = numpy.array([arr[0]])
  a_sorted = numpy.empty(len(arr)+1, dtype=sample.dtype)
  a_sorted[1:] = arr
  a_sorted[1:].sort(kind='heapsort')
  a_sorted[0] = a_sorted[1] # dummy data
  a_diff = numpy.diff(a_sorted)  # == a_sorted[1:] - a_sorted[:-1]
  a_avg_abs = (numpy.abs(a_sorted[1:]) + numpy.abs(a_sorted[:-1])) * 0.5
  # FIXME: handle case where a_avg_abs is truly ZERO -> in this case
  # the abs should be 1.
  a_rdiff = numpy.abs(a_diff) / a_avg_abs
  # hack the first rdiff since this element *must* always be present,
  # so this trick marks it as "unique":
  a_rdiff[0] = rdiff_threshold*100
  # free up the memory:
  if not debug:
    a_diff = None
    a_avg_abs = None
  # Elements whose rdiff < rdiff_cutoff should be consolidated.
  # Since there is no easy way to find these elements in bulk,
  # I resort to "sorting": :(
  rdiff_idx_sorted = numpy.argsort(a_rdiff, kind='mergesort')

  _debug_gen_float_indices1(locals(), debug)

  imax = len(rdiff_idx_sorted)
  # unique_map: mapping from original indices to unique indices
  unique_map = {}
  # unique_set: set of unique-ized elements, excluding those that
  # are distinct by their numerical distances
  unique_vals = numpy.empty((len(arr),), dtype= sample.dtype) # max len
  n_unique_vals = 0
  rslt = None
  for (last_idx,i) in enumerate(rdiff_idx_sorted):
    if a_rdiff[i] > rdiff_threshold:
      # Stop, all the rest of the values are unique.
      break
    elif a_rdiff[i] == -1:
      continue
    else:
      # If two values are adjacent (e.g. in this case
      # a_sorted[i] and a_sorted[i+1] -- note the dummy value
      # at element 0), there may be more than one values like that,
      # so we need to take care of that too.
      # This is why the lower bound of the indices below is "i1"
      # while the upper is "i".
      i_found = i
      i1 = i

      while i1 > 0 and a_rdiff[i1-1] <= rdiff_threshold: i1 -= 1
      i += 1
      while i < imax and a_rdiff[i] <= rdiff_threshold: i += 1
      orig_rdiff = a_rdiff[i1-1:i].copy()
      a_rdiff[i1-1:i] = -1

      a_fused_sect = a_sorted[i1:i+1]
      avg = numpy.mean(a_fused_sect)
      unique_vals[n_unique_vals] = avg
      for a in a_fused_sect:
        unique_map[a] = n_unique_vals
      n_unique_vals += 1

      _debug_gen_float_indices_found_duplicates(locals(), debug)

  # unique_vals will contain the unique elements.
  # - Then, copy over the rest elements who are already unique
  # - Also, complete the value-to-index lookup
  a_already_unique = [ a_sorted[i+1] for i in rdiff_idx_sorted[last_idx:] if a_rdiff[i] != -1 ]
  n_all_unique_vals = n_unique_vals + len(a_already_unique)
  unique_vals[n_unique_vals:n_all_unique_vals] = a_already_unique
  _debug_gen_float_indices2(locals(), debug)

  dn = 0
  for i in rdiff_idx_sorted[last_idx:]:
    if a_rdiff[i] == -1: continue
    a = a_sorted[i+1]
    unique_map[a] = n_unique_vals + dn
    dn += 1

  # Sort the indices based on the unique value
  rslt_sort_idx = unique_vals[:n_all_unique_vals].argsort(kind='heapsort')
  rslt_sort_ridx = dict((b,a) for (a,b) in enumerate(rslt_sort_idx))

  # Update the value-to-index lookup and return the sorted index array
  for a in unique_map.keys():
    #unique_map[a] = rslt_sort_idx[unique_map[a]]
    unique_map[a] = rslt_sort_ridx[unique_map[a]]
  rslt_vals = unique_vals[rslt_sort_idx]

  _debug_gen_float_indices_results(locals(), debug)

  return result_base(
    # list of unique indices, sorted in ascending order:
    vals=rslt_vals,
    # mapping from less-unique values to the index of the new (unique-ized) new , sorted in ascending order
    index_mapping=unique_map,
  )
* Module wpylib.db.indexing_float: utility for floating-point (FP)-based indexing, allowing tolerances to account for imprecise nature of FP numbers. Initial implementation, rather complicated. A simple rounding-based implementation can be put in later. Includes initial test. 12 years ago			`#`
			`# wpylib.db.indexing_float`
			`# Utilities for indexing based on floating-point values`
			`#`
			`# Wirawan Purwanto`
			`# Created: 20130301`
			`#`

			`"""\`
			`wpylib.db.indexing_float`
			`Utilities for indexing based on floating-point values`
			`"""`

			`import numpy`
			`import sys`


			`def _debug_gen_float_indices1(localvars, debug):`
			`from wpylib.params.params_flat import Parameters as params`
			`L = params(localvars)`
			`if debug > 50:`
			`print "a_sorted = ", L.a_sorted[1:]`
			`print "a_diff = ", L.a_diff`
			`print "a_avg_abs = ", L.a_avg_abs`
			`print "a_rdiff = ", L.a_rdiff`
			`print`
			`#print "rdiff_idx_sorted = ", L.rdiff_idx_sorted # numpy.array(L.rdiff_idx_sorted, dtype=float)`
			`print "rdiff_idx_sorted = ", " ".join([ "%11d" % i for i in L.rdiff_idx_sorted ])`
			`print "too_close = ", " ".join([ "%11d" % int(i) for i in (L.a_rdiff[L.rdiff_idx_sorted] < L.rdiff_threshold) ])`
			`print "a_rdiff(sort) = ", L.a_rdiff[L.rdiff_idx_sorted]`
			`print "a(sort) = ", L.a_sorted[1:][L.rdiff_idx_sorted]`
			`print`

			`def _debug_gen_float_indices2(localvars, debug):`
			`from wpylib.params.params_flat import Parameters as params`
			`L = params(localvars)`
			`if debug > 50:`
			`print`
			`print "a_rdiff aft = ", L.a_rdiff`
			`print "num unique vals = ", L.n_all_unique_vals`
			`print "num already uniq = ", len(L.a_already_unique)`
			`print "unique_vals = ", L.unique_vals[0:L.n_all_unique_vals]`
			`print "unique_vals(sort)= ", numpy.sort(L.unique_vals[0:L.n_all_unique_vals])`

			`def _debug_gen_float_indices_found_duplicates(localvars, debug):`
			`from wpylib.params.params_flat import Parameters as params`
			`L = params(localvars)`
			`if debug > 100:`
			`print "i=", L.i_found, " fused range is ", L.i1, ":", L.i+1`
			`print " rdiff", L.orig_rdiff`
			`print " idx ", L.i1, L.i, ", arr ", L.a_fused_sect`
			`print " avg ", L.avg`

			`def _debug_gen_float_indices_results(localvars, debug):`
			`from wpylib.params.params_flat import Parameters as params`
			`L = params(localvars)`
			`if debug > 50:`
			`print`
			`print "rslt_vals = ", L.rslt_vals`
			`print "unique_map = ", L.unique_map`



			`def generate_float_indices(arr, rdiff_threshold, debug=0):`
			"""Consolidates floating point values to `unique' values whose relative
			`differences are greater than a specified threshold (rdiff_threshold).`
			`Values that are so close together will fused to their average.`

			`The input must be a one-dimensional array or list or a list-like iterable.`
			`"""`
			`from wpylib.db.result_base import result_base`
			`sample = numpy.array([arr[0]])`
			`a_sorted = numpy.empty(len(arr)+1, dtype=sample.dtype)`
			`a_sorted[1:] = arr`
			`a_sorted[1:].sort(kind='heapsort')`
			`a_sorted[0] = a_sorted[1] # dummy data`
			`a_diff = numpy.diff(a_sorted) # == a_sorted[1:] - a_sorted[:-1]`
			`a_avg_abs = (numpy.abs(a_sorted[1:]) + numpy.abs(a_sorted[:-1])) * 0.5`
* Documentation & test update. 10 years ago			`# FIXME: handle case where a_avg_abs is truly ZERO -> in this case`
			`# the abs should be 1.`
* Module wpylib.db.indexing_float: utility for floating-point (FP)-based indexing, allowing tolerances to account for imprecise nature of FP numbers. Initial implementation, rather complicated. A simple rounding-based implementation can be put in later. Includes initial test. 12 years ago			`a_rdiff = numpy.abs(a_diff) / a_avg_abs`
			`# hack the first rdiff since this element must always be present,`
			`# so this trick marks it as "unique":`
			`a_rdiff[0] = rdiff_threshold*100`
			`# free up the memory:`
			`if not debug:`
			`a_diff = None`
			`a_avg_abs = None`
			`# Elements whose rdiff < rdiff_cutoff should be consolidated.`
			`# Since there is no easy way to find these elements in bulk,`
			`# I resort to "sorting": :(`
			`rdiff_idx_sorted = numpy.argsort(a_rdiff, kind='mergesort')`

			`_debug_gen_float_indices1(locals(), debug)`

			`imax = len(rdiff_idx_sorted)`
			`# unique_map: mapping from original indices to unique indices`
			`unique_map = {}`
			`# unique_set: set of unique-ized elements, excluding those that`
			`# are distinct by their numerical distances`
			`unique_vals = numpy.empty((len(arr),), dtype= sample.dtype) # max len`
			`n_unique_vals = 0`
			`rslt = None`
			`for (last_idx,i) in enumerate(rdiff_idx_sorted):`
			`if a_rdiff[i] > rdiff_threshold:`
			`# Stop, all the rest of the values are unique.`
			`break`
			`elif a_rdiff[i] == -1:`
			`continue`
			`else:`
			`# If two values are adjacent (e.g. in this case`
			`# a_sorted[i] and a_sorted[i+1] -- note the dummy value`
			`# at element 0), there may be more than one values like that,`
			`# so we need to take care of that too.`
			`# This is why the lower bound of the indices below is "i1"`
			`# while the upper is "i".`
			`i_found = i`
			`i1 = i`

			`while i1 > 0 and a_rdiff[i1-1] <= rdiff_threshold: i1 -= 1`
			`i += 1`
			`while i < imax and a_rdiff[i] <= rdiff_threshold: i += 1`
			`orig_rdiff = a_rdiff[i1-1:i].copy()`
			`a_rdiff[i1-1:i] = -1`

			`a_fused_sect = a_sorted[i1:i+1]`
			`avg = numpy.mean(a_fused_sect)`
			`unique_vals[n_unique_vals] = avg`
			`for a in a_fused_sect:`
			`unique_map[a] = n_unique_vals`
			`n_unique_vals += 1`

			`_debug_gen_float_indices_found_duplicates(locals(), debug)`

			`# unique_vals will contain the unique elements.`
			`# - Then, copy over the rest elements who are already unique`
			`# - Also, complete the value-to-index lookup`
			`a_already_unique = [ a_sorted[i+1] for i in rdiff_idx_sorted[last_idx:] if a_rdiff[i] != -1 ]`
			`n_all_unique_vals = n_unique_vals + len(a_already_unique)`
			`unique_vals[n_unique_vals:n_all_unique_vals] = a_already_unique`
			`_debug_gen_float_indices2(locals(), debug)`

			`dn = 0`
			`for i in rdiff_idx_sorted[last_idx:]:`
			`if a_rdiff[i] == -1: continue`
			`a = a_sorted[i+1]`
			`unique_map[a] = n_unique_vals + dn`
			`dn += 1`

			`# Sort the indices based on the unique value`
			`rslt_sort_idx = unique_vals[:n_all_unique_vals].argsort(kind='heapsort')`
			`rslt_sort_ridx = dict((b,a) for (a,b) in enumerate(rslt_sort_idx))`

			`# Update the value-to-index lookup and return the sorted index array`
			`for a in unique_map.keys():`
			`#unique_map[a] = rslt_sort_idx[unique_map[a]]`
			`unique_map[a] = rslt_sort_ridx[unique_map[a]]`
			`rslt_vals = unique_vals[rslt_sort_idx]`

			`_debug_gen_float_indices_results(locals(), debug)`

			`return result_base(`
			`# list of unique indices, sorted in ascending order:`
			`vals=rslt_vals,`
			`# mapping from less-unique values to the index of the new (unique-ized) new , sorted in ascending order`
			`index_mapping=unique_map,`
			`)`