From cae9cabdeaab32b6ba5a7e6fd5e7aa37230fc973 Mon Sep 17 00:00:00 2001 From: wirawan Date: Mon, 27 Sep 2010 19:54:05 +0000 Subject: [PATCH] * Simple text input reader. Imported from pyqmc.utils.text_input . --- iofmt/text_input.py | 211 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 iofmt/text_input.py diff --git a/iofmt/text_input.py b/iofmt/text_input.py new file mode 100644 index 0000000..ac4ff29 --- /dev/null +++ b/iofmt/text_input.py @@ -0,0 +1,211 @@ +#!/usr/bin/python +# $Id: text_input.py,v 1.1 2010-09-27 19:54:05 wirawan Exp $ +# +# wpylib.iofmt.text_input module +# Quick-n-dirty text input utilities +# +# Wirawan Purwanto +# Created: 20090601 +# +# Routines put here are commonly used in my own scripts. +# They are not necessarily suitable for general-purpose uses; evaluate +# your needs and see if they can them as well. +# +# 20090601: Created as pyqmc.utils.text_input . +# 20100927: Moved to wpylib.iofmt.text_input . +# +# TODO +# - book-keep the line number. Also note superfile must have its own line +# number keeping. +# +""" +Simple text-based input reader. + +This module is part of wpylib project. +""" + +import re +import numpy + +from wpylib.file.file_utils import open_input_file + + +class text_input(object): + '''Text input reader with support for UNIX-style comment marker (#) and + standard field separation (tabs and whitespaces). + Used for quick and dirty data reading (iterating only once in forward + direction without the need of rewinding or skipping). + This object can be treated like an input file, e.g. used as an iterator, + etc. + + To support more fancy options (e.g., rewinding), use "superize=1" when + creating the instance.''' + + def __init__(self, fname, **opts): + if opts.get("superize", 0): + open_opts = { "superize" : opts["superize"] } + del opts["superize"] + else: + open_opts = {} + self.file = open_input_file(fname, **open_opts) + # field_filtering_proc field can be used to filter unwanted fields, or do + # some additional transformations before final feed to the main iteration. + self.field_filtering_proc = lambda flds : flds + # Default fancy options: + self.skip_blank_lines = True + if len(opts) > 0: + self.set_options(**opts) + + def __del__(self): + if getattr(self, "file", None): + self.file.close() + + def __iter__(self): + return self + + """ + def next(self): + while True: + L = self.file.next() + F = self.field_filtering_proc(L.split("#")[0].split()) + if len(F) > 0: + return F + """ + + def next_rec(self): + '''Yields the next record, which is already separated into fields.''' + while True: + L = self.file.next() + F = self.field_filtering_proc(L.split("#")[0].split()) + if len(F) > 0 or not self.skip_blank_lines: + return F + + def next_line(self): + '''Yields the next line, which is already separated into fields.''' + while True: + L = self.file.next() + F = self.field_filtering_proc(L.split("#")[0].rstrip()) + if len(F) > 0 or not self.skip_blank_lines: + return F + + # Do NOT touch the "next" field below unless you know what you're doing: + next = next_line + + def seek_text(self, regex=None, match=None): + '''Seeks the file until a particular piece text is encountered. + We ignore all comments. + The `regex' argument can be either a regex string or a standard python + regular expression object.''' + + if regex: + if isinstance(regex, str): + Regexp = re.compile(regex) + else: + Regexp = regex + match_proc = lambda x: Regexp.search(x) + else: + match_proc = match + + while True: + L = self.next_line() + if match_proc(L): + return L + + + def read_floats(self, *cols, **kwd): + """Quickly reads a set of floats from a text file. + Returns a numpy array of the values in double precision. + + Example usage: + >>> arr = text_input("/tmp/file.txt").read_floats(0, 2, 3) + to read columns 1, 3, and 4 of the text file /tmp/file.txt, while disregarding + comments. + """ + # float_fields extracts the desired columns and converts them to floats + float_fields = lambda vals : [ float(vals[col]) for col in cols ] + if "maxcount" in kwd: + rslt = [ float_fields(vals.split()) for (c,vals) in zip(xrange(kwd['maxcount']),self) ] + else: + rslt = [ float_fields(vals.split()) for vals in self ] + # finally convert them to a numpy ndarray: + return numpy.array(rslt) + + def read_items(self, *col_desc, **kwd): + """Quickly reads a set of items from records of whitespace-separated fields + in a text file. + Returns a structured numpy array of the values read. + + Example usage: + + >>> arr = text_input("/tmp/file.txt").read_items(0, (2, int), (3, "S10", "Atom")) + + reads columns 1 (as floats, by default), 3 (as integers), and 4 (as strings of + max length of 10, which field is named "Atom") from the text file /tmp/file.txt, + while disregarding comments. + + If the tuple contains the third field, it is used as the name of the field; + otherwise the fields are named f0, f1, f2, .... + + Additional keyword options: + * deftype: default datatype + * maxcount: maximum number of records to be read + + TODO: Needs ability to read in complex data. + """ + deftype = kwd.get("deftype", float) + + # float_fields extracts the desired columns and converts them to floats + flds = [] + cols = [] + for (i,c) in zip(xrange(len(col_desc)), col_desc): + if type(c) == int: + cols.append(c) + flds.append(('f' + str(i), deftype)) + elif len(c) == 1: + cols.append(c[0]) + flds.append(('f' + str(i), deftype)) + elif len(c) == 2: + cols.append(c[0]) + flds.append(('f' + str(i), c[1])) + elif len(c) == 3: + cols.append(c[0]) + flds.append((c[2], c[1])) + + #print cols + #print flds + get_fields = lambda vals : tuple([ vals[col] for col in cols ]) + if "maxcount" in kwd: + #print "hello" + rslt = [ get_fields(vals.split()) for (c,vals) in zip(xrange(kwd['maxcount']),self) ] + else: + rslt = [ get_fields(vals.split()) for vals in self ] + #print rslt + # finally convert them to a numpy ndarray: + return numpy.array(rslt, dtype=flds) + + # Sets fancy options + def set_options(self, **opts): + for (o,v) in opts.iteritems(): + if o == "expand_errorbar": + self.expand_errorbar(v) + if o == "skip_blank_lines": + self.skip_blank_lines = v + else: + raise "ValueError", "Invalid option: %s" % (o,) + return self + + # Option for errorbar expansion: + def expand_errorbar(self, v=True): + '''Enables or disables errorbar expansion.''' + if v: + self.opt_expand_errorbar = True + self.field_filtering_proc = self.expand_errorbar_hook + else: + self.opt_expand_errorbar = False + self.field_filtering_proc = lambda flds : flds + return self + + def expand_errorbar_hook(self, F): + # A hook for field_filtering_proc for expanding errorbars: + from pyqmc.stats.errorbar import expand + return expand(F, flatten=True)