From ad841e0b90f1b993e7c14d16db3b6a65f3534315 Mon Sep 17 00:00:00 2001 From: Wirawan Purwanto Date: Tue, 17 Apr 2012 11:03:29 -0400 Subject: [PATCH] * text_input.read_items(): added option `end_line_match' and `last_line_match' (mutually exclusive options) to allow the dataset reading to end upon encountering certain text pattern (or a more complicated match, if we specify a function for the option value). These options can be used to work with the `maxcount' option; the shortest of the two (maxcount records read first, or end/last_line_match finds a match) will end the reading of the dataset. --- iofmt/text_input.py | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/iofmt/text_input.py b/iofmt/text_input.py index 28ab69c..e430ffd 100644 --- a/iofmt/text_input.py +++ b/iofmt/text_input.py @@ -27,10 +27,25 @@ This module is part of wpylib project. import re import numpy +from wpylib.sugar import zip_gen from wpylib.file.file_utils import open_input_file from wpylib.py import make_unbound_instance_method import wpylib.py.im_weakref +def make_match_proc(match): + """Make matching procedure: simple string becomes regexp, + regexp remains regexp, and other callable object is passed as is.""" + if isinstance(match, basestring): + Regexp = re.compile(match) + match_proc = lambda x: Regexp.search(x) + elif hasattr(getattr(match, "search", None), "__call__"): + Regexp = match + match_proc = lambda x: Regexp.search(x) + else: + match_proc = match + return match_proc + + class text_input(object): '''Text input reader with support for UNIX-style comment marker (#) and standard field separation (tabs and whitespaces). @@ -167,6 +182,7 @@ class text_input(object): If the tuple contains the third field, it is used as the name of the field; otherwise the fields are named f0, f1, f2, .... + Preliminary ability to read in complex data has been added! Complex data (floating-point only) must be specified as a tuple of two columns containing the real and imaginary data, like this: ((2, 3), complex, 'ampl') @@ -177,8 +193,13 @@ class text_input(object): Additional keyword options: * deftype: default datatype * maxcount: maximum number of records to be read + * end_line_match: a regular expression or test subroutine accepting a + single argument (i.e. the text line) marking the end boundary of the list + to be read (i.e. one line past the list contents) + * last_line_match: a regular expression or test subroutine accepting a + single argument (i.e. the text line) marking the last element of the list + to be read - TODO: Needs ability to read in complex data. """ deftype = kwd.get("deftype", float) @@ -226,7 +247,28 @@ class text_input(object): cols = reg.cols flds = reg.flds get_fields = lambda vals : tuple([ filt(vals,col) for (filt,col) in cols ]) + if "maxcount" in kwd: + src_iter = zip_gen(xrange(kwd['maxcount']),self) + else: + src_iter = enumerate(self) + # FIXME below: zip() evaluates the function before the loop, thus may + # eat a lot of memory. + if 'end_line_match' in kwd: + rslt = [] + match = make_match_proc(kwd['end_line_match']) + for (c,vals) in src_iter: + if match(vals): + break + rslt.append(get_fields(vals.split())) + elif 'last_line_match' in kwd: + rslt = [] + match = make_match_proc(kwd['end_line_match']) + for (c,vals) in src_iter: + rslt.append(get_fields(vals.split())) + if match(vals): + break + elif "maxcount" in kwd: #print "hello" rslt = [ get_fields(vals.split()) for (c,vals) in zip(xrange(kwd['maxcount']),self) ] else: