BUG: Make python2 *.npy files readable in python3.

charris · charris · commit 8f068b7866a2 · 2014-10-12T17:40:49.000-06:00
Files generated by Python2 had long integer literals like '1L' in their headers, which broke when read in Python3. The fix is to filter out the 'L' and let safe_eval take care of the integer type when converting the string. The fix comes from Nathaniel Smith with a few added fixups. Closes numpy#5170.
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
@@ -141,7 +141,7 @@
 import io
 import warnings
 from numpy.lib.utils import safe_eval
-from numpy.compat import asbytes, isfileobj, long, basestring
+from numpy.compat import asbytes, asstr, isfileobj, long, basestring
 
 if sys.version_info[0] >= 3:
     import pickle
@@ -410,6 +410,45 @@ def read_array_header_2_0(fp):
     """
     _read_array_header(fp, version=(2, 0))
 
+
+def _filter_header(s):
+    """Clean up 'L' in npy header ints.
+
+    Removes the 'L' suffix that Python2 appends to long integer literals
+    (e.g. '1L') so that npy headers produced in Python2 parse in Python3.
+
+    Parameters
+    ----------
+    s : byte string
+        Npy file header.
+
+    Returns
+    -------
+    header : str
+        Cleaned up header.
+    """
+    import tokenize
+    if sys.version_info[0] >= 3:
+        from io import StringIO
+    else:
+        from StringIO import StringIO
+ 
+    tokens = []
+    last_token_was_number = False
+    # NOTE(review): tokenize expects a readline-style callable; confirm .read.
+    for token in tokenize.generate_tokens(StringIO(asstr(s)).read):
+        token_type = token[0]
+        token_string = token[1]
+        if (last_token_was_number and
+                token_type == tokenize.NAME and
+                token_string == "L"):
+            continue
+        else:
+            tokens.append(token)
+        last_token_was_number = (token_type == tokenize.NUMBER)
+    return tokenize.untokenize(tokens)
+
+ 
 def _read_array_header(fp, version):
     """
     see read_array_header_1_0
@@ -434,6 +473,7 @@ def _read_array_header(fp, version):
     #   "shape" : tuple of int
     #   "fortran_order" : bool
     #   "descr" : dtype.descr
+    header = _filter_header(header)
     try:
         d = safe_eval(header)
     except SyntaxError as e: