From d7a65386a37936876061beff3c44f9e1b9f203c9 Mon Sep 17 00:00:00 2001
From: Wirawan Purwanto <wirawan0@gmail.com>
Date: Tue, 19 Aug 2014 11:06:08 -0400
Subject: [PATCH] * Fortran binary file: Added bulk_read_array1 method for
 quick reading of   array of (uniform-type) Fortran records.

---
 iofmt/fortbin.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/iofmt/fortbin.py b/iofmt/fortbin.py
index 7e954d3..bfde2e8 100644
--- a/iofmt/fortbin.py
+++ b/iofmt/fortbin.py
@@ -116,6 +116,7 @@ class fortran_bin_file(object):
 
   def byte_length(self, *fields):
     """Given a list of field descriptors, determine how many bytes this
+    set of fields would occupy.
     """
     expected_len = sum([ self.fld_count(f) * numpy.dtype(f[1]).itemsize
                            for f in fields ])
@@ -194,6 +195,41 @@ class fortran_bin_file(object):
 
     return rslt
 
+  def bulk_read_array1(self, dtype, shape):
+    """Reads data that is regularly stored as an array of Fortran records
+    (all of the same type and length).
+    Each record must be 'read' individually and validated if the record lengths
+    are indeed correct.
+    But this routine will bulk-read all the records at once, and shape it
+    into an array with that format.
+
+    Warning: because we load all the leading and trailing reclen markers, the array
+    will be larger than the actual size of the data, and the memory will not be
+    contiguous.
+    Use copy_subarray below to create the contiguous representation of the data
+    (per field name).
+    """
+    from numpy import product, fromfile, all
+    dtype1 = numpy.dtype([('reclen', self.record_marker_type),
+                          ('content', dtype),
+                          ('reclen2', self.record_marker_type)])
+
+    dtype_itemsize = dtype1['content'].itemsize
+
+    size = product(shape) # total number of elements to read in bulk
+    # reads in *ALL* the records in a linear fashion, in one read stmt
+    arr = fromfile(self.F, dtype1, size)
+
+    if not all(arr['reclen'] == dtype_itemsize) \
+       or not all(arr['reclen2'] == dtype_itemsize):
+      raise IOError, \
+        (("Inconsistency detected in record array: " \
+          "one or more records do not have the expected record length=%d") \
+         % (dtype_itemsize,))
+
+    # Returns only the content--this WILL NOT be contiguous in memory.
+    return arr['content'].reshape(shape, order='F')
+
   def write_vals(self, *vals, **opts):
     """Writes a Fortran record.
     Only values need to be given, because the types are known.
@@ -315,3 +351,27 @@ def array_major_dim(arr):
         "Unable to determine whether this is a row or column major object."
 
 
+def copy_subarray(arr, key, order='F'):
+  """Given a numpy array of structured datatype, copy out a subarray field
+  into a new array with contiguous format.
+  The field accessed by arr[key] must be a fixed-size array.
+  The order argument can be either 'F' or 'C':
+  - For 'F' ordering, then the subarray index will become the *first* index.
+  - For 'C' ordering, then the subarray index will become the *last* index.
+  """
+  subarr = arr[key]
+  dim = len(arr.shape)
+  subdim = len(subarr.shape) - dim
+  if order == 'F':
+    rslt = numpy.transpose(subarr, axes=list(range(dim, subdim+dim) + range(dim)))
+  elif order == 'C':
+    rslt = subarr
+  else:
+    raise ValueError, 'Invalid order argument'
+  # Always return a copy!
+  if numpy.may_share_memory(rslt, arr):
+    return rslt.copy(order=order)
+  else:
+    return rslt
+
+