* Added my tools to parse/process textual table, from Cr2 project.

10 years ago · b93d40d0fc
parent bfc96d4335
commit b93d40d0fc
1 changed files with 92 additions and 2 deletions
--- a/iofmt/text_input.py
+++ b/iofmt/text_input.py
@ -128,10 +128,10 @@ class text_input(object):
    return self.next_(self)

  def seek_text(self, regex=None, match=None):
-    '''Seeks the file until a particular piece text is encountered.
+    """Seeks the file until a particular piece text is encountered.
    We ignore all comments.
    The `regex' argument can be either a regex string or a standard python
-    regular expression object.'''
+    regular expression object."""

    if regex:
      if isinstance(regex, basestring):
@ -349,3 +349,93 @@ def tail(filename, maxlines):
    pass
  return out[-maxlines:]

+
+
+# More tools for extracting data from table-like text stream/string.
+
+tbl_filter_num1_rx = re.compile('^\s*[-+]?(?:[0-9]+|[0-9]+\.|\.[0-9]+|[0-9]+\.[0-9]+)(?:[EeDd][-+]?[0-9]+)?')
+def tbl_filter_num1(flds, col=0, **args):
+  """Simple filter function: given a list of splitted text in `flds`,
+  if the col-th field of the row is a numerical
+  string, then it is a valid row; otherwise we will ignore this row.
+  """
+  return tbl_filter_num1_rx.match(flds[col])
+
+
+def filter_table_text(T, filter=tbl_filter_num1, filter_args={}):
+  """Filters out irrelevant text (junk) from the table by commenting them out.
+  Using the default filter, we assume that the target column (default==0)
+  is a numerical value (usually a geometry value or a similar parameter).
+
+  Input:
+  * T = a text table (a multi-line string, with the linebreaks)
+  * filter = a filter function
+  * filter_args = dict-style arguments for the filter function."""
+  Lines = T.splitlines()
+  for (i,L) in enumerate(Lines):
+    F = L.split()
+    if len(F) == 0:
+      pass
+    elif not F[0].startswith("#") and not filter(F, **filter_args):
+      Lines[i] = "#" + L
+  return "\n".join(Lines)
+
+
+class tbl_filter_num1_limited_range(object):
+  """Fancy filtering: Assume that the first column is numerical
+  (e.g., rbond); and only include rows where this `rbond` fall
+  within a given range.
+  """
+  def __init__(self, rmin, rmax, col=0):
+    self.rmin, self.rmax = rmin, rmax
+    self.col = col
+  def __call__(self, flds, **args):
+    if tbl_filter_num1_rx.match(flds[self.col]):
+      r = float(flds[0])
+      return self.rmin <= r <= self.rmax
+    else:
+      return False
+  def mk_table_filter(self):
+    return lambda T: filter_table_text(T,filter=self)
+  @classmethod
+  def create(cls, rmin, rmax, col=0):
+    o = cls(rmin, rmax, col=col)
+    func = o.mk_table_filter()
+    func.__name__ = "%s.create(%.4f,%.4f,%d)" \
+                  % (cls.__name__, rmin, rmax, col)
+    return func
+
+
+def read_table(F, maps={}):
+  """Reads in a 2-D table from a text stream.
+  Returns a list of lists containing the table content, in each cell by
+  default as a string, unless a mapping function is provided (for simple
+  data conversion only).
+
+  This is a legacy tool. It appears that numpy.genfromtxt can do what
+  this tool can do, and better.
+  You should probably check if numpy.genfromtxt can do the required job
+  before using read_table/read_table_text provided in this module.
+  """
+  rows = []
+  comment_char = "#"
+  for L in F:
+    L = L.split(comment_char,1)[0]
+    flds = L.split()
+    if len(flds) == 0:
+      continue
+    if maps:
+      for i in xrange(len(flds)):
+        if i in maps:
+          flds[i] = maps[i](flds[i])
+    rows.append(flds)
+  return rows
+
+
+def read_table_text(txt, maps={}):
+  """Reads in a 2-D table from a text stream.
+  The text (as a whole string) is given in the txt argument.
+  """
+  from StringIO import StringIO
+  return read_table(StringIO(txt), maps)
+