From 099f3e7e060cfd14226e0dfb8ec449dd2ce50bdc Mon Sep 17 00:00:00 2001
From: Wirawan Purwanto <wirawan0@gmail.com>
Date: Sun, 3 Mar 2013 09:15:43 -0500
Subject: [PATCH] * Module wpylib.db.indexing_float: utility for floating-point
 (FP)-based   indexing, allowing tolerances to account for imprecise nature of
 FP   numbers.

  Initial implementation, rather complicated.
  A simple rounding-based implementation can be put in later.

  Includes initial test.
---
 db/indexing_float.py      | 166 ++++++++++++++++++++++++++++++++++++++
 db/test_indexing_float.py |  59 ++++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 db/indexing_float.py
 create mode 100644 db/test_indexing_float.py

diff --git a/db/indexing_float.py b/db/indexing_float.py
new file mode 100644
index 0000000..24b81dc
--- /dev/null
+++ b/db/indexing_float.py
@@ -0,0 +1,166 @@
+#
+# wpylib.db.indexing_float
+# Utilities for indexing based on floating-point values
+#
+# Wirawan Purwanto
+# Created: 20130301
+#
+
+"""\
+wpylib.db.indexing_float
+Utilities for indexing based on floating-point values
+"""
+
+import numpy
+import sys
+
+
+def _debug_gen_float_indices1(localvars, debug):
+  """Prints the sorted input and the relative-difference work arrays when debug > 50."""
+  from wpylib.params.params_flat import Parameters
+  v = Parameters(localvars)
+  if not debug > 50: return
+  print "a_sorted         = ", v.a_sorted[1:]
+  print "a_diff           = ", v.a_diff
+  print "a_avg_abs        = ", v.a_avg_abs
+  print "a_rdiff          = ", v.a_rdiff
+  print
+  print "rdiff_idx_sorted =  ", " ".join("%11d" % k for k in v.rdiff_idx_sorted)
+  print "too_close        =  ", " ".join("%11d" % int(flag) for flag in (v.a_rdiff[v.rdiff_idx_sorted] < v.rdiff_threshold))
+  print "a_rdiff(sort)    = ", v.a_rdiff[v.rdiff_idx_sorted]
+  print "a(sort)          = ", v.a_sorted[1:][v.rdiff_idx_sorted]
+  print
+
+def _debug_gen_float_indices2(localvars, debug):  # prints the unique-value bookkeeping when debug > 50
+  from wpylib.params.params_flat import Parameters
+  v = Parameters(localvars)
+  if not debug > 50: return
+  print
+  print "a_rdiff aft      = ", v.a_rdiff
+  print "num unique vals  = ", v.n_all_unique_vals
+  print "num already uniq = ", len(v.a_already_unique)
+  print "unique_vals      = ", v.unique_vals[:v.n_all_unique_vals]
+  print "unique_vals(sort)= ", numpy.sort(v.unique_vals[:v.n_all_unique_vals])
+
+def _debug_gen_float_indices_found_duplicates(localvars, debug):  # prints one fused run when debug > 100
+  from wpylib.params.params_flat import Parameters
+  v = Parameters(localvars)
+  if not debug > 100: return
+  print "i=", v.i_found, " fused range is ", v.i1, ":", v.i+1
+  print " rdiff", v.orig_rdiff
+  print "  idx ", v.i1, v.i, ", arr ", v.a_fused_sect
+  print "  avg ", v.avg
+
+def _debug_gen_float_indices_results(localvars, debug):  # prints the final values and mapping when debug > 50
+  from wpylib.params.params_flat import Parameters
+  v = Parameters(localvars)
+  if not debug > 50: return
+  print
+  print "rslt_vals        = ", v.rslt_vals
+  print "unique_map       = ", v.unique_map
+
+
+
+def generate_float_indices(arr, rdiff_threshold, debug=0):
+  """Fuses nearly-equal values of the 1-D `arr' (array, list or list-like: needs len() and arr[0])
+  into their average: sorted neighbours whose relative difference |a-b| / ((|a|+|b|)/2) is
+  <= rdiff_threshold join one run.  debug: verbosity for the _debug_* printers (> 50, > 100).
+
+  Returns result_base(vals=<unique values, ascending>, index_mapping=<dict: input value -> index in vals>).
+  """
+  from wpylib.db.result_base import result_base
+  sample = numpy.array([arr[0]])
+  a_sorted = numpy.empty(len(arr)+1, dtype=sample.dtype)
+  a_sorted[1:] = arr
+  a_sorted[1:].sort(kind='heapsort')
+  a_sorted[0] = a_sorted[1] # dummy data
+  a_diff = numpy.diff(a_sorted)  # == a_sorted[1:] - a_sorted[:-1]
+  a_avg_abs = (numpy.abs(a_sorted[1:]) + numpy.abs(a_sorted[:-1])) * 0.5
+  a_rdiff = numpy.abs(a_diff) / a_avg_abs
+  # hack the first rdiff since this element *must* always be present,
+  # so this trick marks it as "unique":
+  a_rdiff[0] = rdiff_threshold*100
+  # free up the memory:
+  if not debug:
+    a_diff = None
+    a_avg_abs = None
+  # Elements whose rdiff <= rdiff_threshold should be consolidated.
+  # Since there is no easy way to find these elements in bulk,
+  # I resort to "sorting": :(
+  rdiff_idx_sorted = numpy.argsort(a_rdiff, kind='mergesort')
+
+  _debug_gen_float_indices1(locals(), debug)
+
+  imax = len(rdiff_idx_sorted)
+  # unique_map: mapping from original values to indices of unique values
+  unique_map = {}
+  # unique_vals: buffer of unique-ized values: the fused averages are stored
+  # first; the values that are already distinct are appended after the loop
+  unique_vals = numpy.empty((len(arr),), dtype= sample.dtype) # max len
+  n_unique_vals = 0
+  rslt = None
+  for (last_idx,i) in enumerate(rdiff_idx_sorted):
+    if a_rdiff[i] > rdiff_threshold:
+      # Stop, all the rest of the values are unique.
+      break
+    elif a_rdiff[i] == -1:  # already consumed by a previously fused run
+      continue
+    else:
+      # a_sorted[i] and a_sorted[i+1] are too close (note the dummy value
+      # at element 0), and more neighbours may belong to the same run,
+      # so widen the run downward (i1) and upward (i).
+      # The `0 <=' guards stop the scans at entries already marked -1:
+      # without them a run would swallow a later run that was fused
+      # earlier (and whose rdiff entries are therefore -1 <= threshold).
+      i_found = i
+      i1 = i
+
+      while i1 > 0 and 0 <= a_rdiff[i1-1] <= rdiff_threshold: i1 -= 1
+      i += 1
+      while i < imax and 0 <= a_rdiff[i] <= rdiff_threshold: i += 1
+      orig_rdiff = a_rdiff[i1-1:i].copy()
+      a_rdiff[i1-1:i] = -1
+
+      a_fused_sect = a_sorted[i1:i+1]
+      avg = numpy.mean(a_fused_sect)
+      unique_vals[n_unique_vals] = avg
+      for a in a_fused_sect:
+        unique_map[a] = n_unique_vals
+      n_unique_vals += 1
+
+      _debug_gen_float_indices_found_duplicates(locals(), debug)
+
+  # unique_vals will contain the unique elements.
+  # - Then, copy over the remaining elements that are already unique
+  # - Also, complete the value-to-index lookup
+  a_already_unique = [ a_sorted[i+1] for i in rdiff_idx_sorted[last_idx:] if a_rdiff[i] != -1 ]
+  n_all_unique_vals = n_unique_vals + len(a_already_unique)
+  unique_vals[n_unique_vals:n_all_unique_vals] = a_already_unique
+  _debug_gen_float_indices2(locals(), debug)
+
+  dn = 0
+  for i in rdiff_idx_sorted[last_idx:]:
+    if a_rdiff[i] == -1: continue
+    a = a_sorted[i+1]
+    unique_map[a] = n_unique_vals + dn
+    dn += 1
+
+  # Sort the indices based on the unique value
+  rslt_sort_idx = unique_vals[:n_all_unique_vals].argsort(kind='heapsort')
+  rslt_sort_ridx = dict((b,a) for (a,b) in enumerate(rslt_sort_idx))
+
+  # Update the value-to-index lookup and return the sorted index array
+  for a in unique_map.keys():
+    # rslt_sort_ridx maps pre-sort position -> sorted position (inverse of rslt_sort_idx)
+    unique_map[a] = rslt_sort_ridx[unique_map[a]]
+  rslt_vals = unique_vals[rslt_sort_idx]
+
+  _debug_gen_float_indices_results(locals(), debug)
+
+  return result_base(
+    # list of unique values, sorted in ascending order:
+    vals=rslt_vals,
+    # mapping from each original value to the index of its unique-ized value in vals:
+    index_mapping=unique_map,
+  )
+
diff --git a/db/test_indexing_float.py b/db/test_indexing_float.py
new file mode 100644
index 0000000..6f97923
--- /dev/null
+++ b/db/test_indexing_float.py
@@ -0,0 +1,59 @@
+from numpy import array, concatenate
+from wpylib.db.indexing_float import generate_float_indices
+
+indices1 = array([ 0.80038202,  0.28583295,  0.13505145,  0.79425102,  0.52347217,  0.47955401,  0.07961833,  0.1024241 ,  0.26336713,  0.15990201,  0.81311686,  0.98632763,  0.08275991,
+      0.56862337,  0.5679713 ,  0.04377884,  0.93023717,  0.60270102,  0.24538933,  0.63922544])  # 20 sample keys, all between 0 and 1
+indices2 = array([ 0.69053462,  0.09864655,  0.86209023,  0.26140917,  0.8086512 ,  0.13796145,  0.1770305 ,  0.05061917,  0.81191537,  0.72801096,  0.01129504,  0.13962617,  0.56217892,
+      0.94299591,  0.99302594,  0.01167897,  0.54827444,  0.20160252,  0.86603525,  0.20260494])  # 20 more sample keys, all between 0 and 1
+
+
+def Test_1():  # fuses the 10 largest sample keys with rdiff_threshold=1e-2 and returns the result
+  indices_raw = concatenate((indices1, indices2))
+  keys1 = array(sorted(indices_raw))  # was numpy.sort(): `numpy' itself is not imported in this file
+  keys1_test10 = keys1[-10:]
+
+  ans = generate_float_indices(keys1_test10, 1e-2, debug=101)
+  """ans must be:
+   {
+     'vals': array([ 0.80038202,  0.81122781,  0.86406274,  0.93023717,  0.94299591,  0.98967679]),
+     'index_mapping': \
+       {0.80038201815850551: 0,
+        0.80865119885060532: 1,
+        0.81191536625506044: 1,
+        0.8131168633197402: 1,
+        0.8620902343091833: 2,
+        0.86603524560901635: 2,
+        0.93023716796725509: 3,
+        0.94299590915079168: 4,
+        0.98632763033630222: 5,
+        0.99302594015368861: 5}
+   }
+  """
+  return ans
+
+
+def Test_1b():  # like Test_1, with an extra isolated key 1.03 appended after the 10 largest keys
+  indices_raw = concatenate((indices1, indices2))
+  keys1 = array(sorted(indices_raw))  # was numpy.sort(): `numpy' itself is not imported in this file
+  keys1_test10 = concatenate((keys1[-10:], [1.03]))
+
+  ans = generate_float_indices(keys1_test10, 1e-2, debug=101)
+  """ans must be:
+   {
+     'vals': array([ 0.80038202,  0.81122781,  0.86406274,  0.93023717,  0.94299591,  0.98967679,  1.03      ]),
+     'index_mapping': \
+       {0.80038202000000003: 0,
+        0.80865120000000001: 1,
+        0.81191537000000003: 1,
+        0.81311686000000005: 1,
+        0.86209022999999996: 2,
+        0.86603525000000003: 2,
+        0.93023716999999995: 3,
+        0.94299591000000005: 4,
+        0.98632763000000001: 5,
+        0.99302594: 5,
+        1.03: 6}
+   }
+  """
+  return ans
+