# More tools for extracting data from table-like text stream/string.
#
# Provenance: these definitions were recovered from git patch
# b93d40d0fc0edbe2772b40b45a03a1c570a538a7 (Wirawan Purwanto, 2015-06-04,
# "Added my tools to parse/process textual table, from Cr2 project."),
# which appends them to iofmt/text_input.py.  The same patch also switches
# the docstring of text_input.seek_text from ''' to """ quotes; only a
# fragment of that method is visible in the patch, so it is not repeated
# here.

import re

# Recognizes a decimal number (optional sign, optional fraction, optional
# E/e/D/d exponent) at the START of a string.  The pattern is not anchored
# at the end, so trailing characters after the number are not checked.
tbl_filter_num1_rx = re.compile(
    r'^\s*[-+]?(?:[0-9]+|[0-9]+\.|\.[0-9]+|[0-9]+\.[0-9]+)(?:[EeDd][-+]?[0-9]+)?'
)


def tbl_filter_num1(flds, col=0, **args):
    """Simple row filter: accept a row whose col-th field looks numerical.

    Input:
    * flds = the list of whitespace-split fields of one table row
    * col = index of the field to test (default: 0)
    * args = extra keyword arguments; accepted and ignored

    Returns the regex match object (truthy) when the field begins with a
    number, otherwise None.  A row that has no col-th field is treated as
    not matching instead of raising IndexError.
    """
    try:
        fld = flds[col]
    except IndexError:
        return None
    return tbl_filter_num1_rx.match(fld)


def filter_table_text(T, filter=tbl_filter_num1, filter_args=None):
    """Filters out irrelevant text (junk) from the table by commenting it out.

    Using the default filter, we assume that the target column (default==0)
    is a numerical value (usually a geometry value or a similar parameter).

    Input:
    * T = a text table (a multi-line string, with the linebreaks)
    * filter = a filter function, called as filter(fields, **filter_args);
      a false return value marks the row as junk
    * filter_args = dict-style keyword arguments for the filter function
      (None means no extra arguments)

    Returns the table as a single string joined with "\\n", where every
    rejected row has "#" prepended.  Blank lines and lines whose first
    field already starts with "#" are left untouched.
    """
    kwargs = filter_args or {}
    lines = T.splitlines()
    for i, line in enumerate(lines):
        flds = line.split()
        if not flds or flds[0].startswith("#"):
            continue
        if not filter(flds, **kwargs):
            lines[i] = "#" + line
    return "\n".join(lines)


class tbl_filter_num1_limited_range(object):
    """Fancy filtering: assume that the selected column is numerical
    (e.g., rbond), and only include rows where this value falls within
    the closed range [rmin, rmax].
    """

    def __init__(self, rmin, rmax, col=0):
        """Stores the inclusive bounds and the index of the tested column."""
        self.rmin, self.rmax = rmin, rmax
        self.col = col

    def __call__(self, flds, **args):
        """Returns True if the col-th field is a number within the range.

        Returns False when the row has no such field, when the field does
        not look numerical, or when it cannot be converted to a float.
        Extra keyword arguments are accepted and ignored.
        """
        try:
            fld = flds[self.col]
        except IndexError:
            return False
        if not tbl_filter_num1_rx.match(fld):
            return False
        try:
            # Convert the same column that was tested above.  The regex
            # admits Fortran-style D exponents, which float() rejects, so
            # map them to E first.
            r = float(fld.replace("D", "E").replace("d", "e"))
        except ValueError:
            # The regex only checks the beginning of the field, so text
            # such as "1.5abc" gets this far; treat it as junk.
            return False
        return self.rmin <= r <= self.rmax

    def mk_table_filter(self):
        """Returns a one-argument function that applies filter_table_text
        to a table text, with this object as the row filter."""
        return lambda T: filter_table_text(T, filter=self)

    @classmethod
    def create(cls, rmin, rmax, col=0):
        """Builds the filter object and returns its table-filter function.

        The returned function's __name__ is set to a string that records
        the class name and the rmin, rmax, col values.
        """
        o = cls(rmin, rmax, col=col)
        func = o.mk_table_filter()
        func.__name__ = "%s.create(%.4f,%.4f,%d)" \
                        % (cls.__name__, rmin, rmax, col)
        return func


def read_table(F, maps=None):
    """Reads in a 2-D table from a text stream.

    Returns a list of lists containing the table content, in each cell by
    default as a string, unless a mapping function is provided (for simple
    data conversion only).

    Input:
    * F = an iterable yielding the text lines (e.g. an open file)
    * maps = optional mapping {column index: conversion function}; the
      function is applied to the string in that column of every row

    Everything from the first "#" on a line is discarded as a comment, and
    lines that are empty after that are skipped.

    This is a legacy tool. It appears that numpy.genfromtxt can do what
    this tool can do, and better.
    You should probably check if numpy.genfromtxt can do the required job
    before using read_table/read_table_text provided in this module.
    """
    rows = []
    comment_char = "#"
    for line in F:
        flds = line.split(comment_char, 1)[0].split()
        if not flds:
            continue
        if maps:
            for i, fld in enumerate(flds):
                if i in maps:
                    flds[i] = maps[i](fld)
        rows.append(flds)
    return rows


def read_table_text(txt, maps=None):
    """Reads in a 2-D table from a text string.

    The text (as a whole string) is given in the txt argument; maps has
    the same meaning as in read_table.
    """
    # Splitting on "\n" yields the same lines a StringIO wrapper would,
    # without depending on the Python-2-only StringIO module.
    return read_table(txt.split("\n"), maps)