* text_input.read_items(): added options `end_line_match' and `last_line_match'

(mutually exclusive options) to allow the dataset reading to end upon encountering a certain text pattern (or a more complicated match, if we specify a function for the option value). These options can be used together with the `maxcount' option; whichever condition is met first (maxcount records have been read, or end/last_line_match finds a match) will end the reading of the dataset.
13 years ago · ad841e0b90
parent 501552a65a
commit ad841e0b90
1 changed files with 43 additions and 1 deletions
--- a/iofmt/text_input.py
+++ b/iofmt/text_input.py
@ -27,10 +27,25 @@ This module is part of wpylib project.
 import re
 import numpy

+from wpylib.sugar import zip_gen
 from wpylib.file.file_utils import open_input_file
 from wpylib.py import make_unbound_instance_method
 import wpylib.py.im_weakref

+def make_match_proc(match):
+  """Build a one-argument matching procedure from `match`.
+
+  * A string is compiled with re.compile(); the returned procedure calls
+    the compiled pattern's search() method on its argument.
+  * An object with a callable `search` attribute (e.g. an already compiled
+    regexp) is wrapped the same way, without recompiling.
+  * Anything else is returned unchanged; it is not checked for being
+    callable here.
+
+  Returns the matching procedure. In the two regexp cases its result is
+  whatever search() returns.
+  """
+  # basestring is the Python 2 common base class of str and unicode.
+  if isinstance(match, basestring):
+    Regexp = re.compile(match)
+    match_proc = lambda x: Regexp.search(x)
+  # Duck-typing test: accept any object exposing a callable search().
+  elif hasattr(getattr(match, "search", None), "__call__"):
+    Regexp = match
+    match_proc = lambda x: Regexp.search(x)
+  else:
+    # Assumed to be a user-supplied test function; passed through as is.
+    match_proc = match
+  return match_proc
+
+
 class text_input(object):
  '''Text input reader with support for UNIX-style comment marker (#) and
  standard field separation (tabs and whitespaces).
@ -167,6 +182,7 @@ class text_input(object):
    If the tuple contains the third field, it is used as the name of the field;
    otherwise the fields are named f0, f1, f2, ....

+    Preliminary ability to read in complex data has been added!
    Complex data (floating-point only) must be specified as a tuple of two columns
    containing the real and imaginary data, like this:
       ((2, 3), complex, 'ampl')
@ -177,8 +193,13 @@ class text_input(object):
    Additional keyword options:
    * deftype: default datatype
    * maxcount: maximum number of records to be read
+    * end_line_match: a regular expression or test subroutine accepting a
+      single argument (i.e. the text line) marking the end boundary of the list
+      to be read (i.e. one line past the list contents)
+    * last_line_match: a regular expression or test subroutine accepting a
+      single argument (i.e. the text line) marking the last element of the list
+      to be read

-    TODO: Needs ability to read in complex data.
    """
    deftype = kwd.get("deftype", float)

@ -226,7 +247,28 @@ class text_input(object):
    cols = reg.cols
    flds = reg.flds
    get_fields = lambda vals : tuple([ filt(vals,col) for (filt,col) in cols ])
+
    if "maxcount" in kwd:
+      src_iter = zip_gen(xrange(kwd['maxcount']),self)
+    else:
+      src_iter = enumerate(self)
+    # FIXME below: zip() evaluates the function before the loop, thus may
+    # eat a lot of memory.
+    if 'end_line_match' in kwd:
+      rslt = []
+      match = make_match_proc(kwd['end_line_match'])
+      for (c,vals) in src_iter:
+        if match(vals):
+          break
+        rslt.append(get_fields(vals.split()))
+    elif 'last_line_match' in kwd:
+      rslt = []
+      match = make_match_proc(kwd['end_line_match'])
+      for (c,vals) in src_iter:
+        rslt.append(get_fields(vals.split()))
+        if match(vals):
+          break
+    elif "maxcount" in kwd:
      #print "hello"
      rslt = [ get_fields(vals.split()) for (c,vals) in zip(xrange(kwd['maxcount']),self) ]
    else: