masashi-y · texttheater · May 24, 2017 · May 24, 2017 · May 24, 2017
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 Codebase for [A\* CCG Parsing with a Supertag and Dependency Factored Model](https://arxiv.org/abs/1704.06936)
 
 #### Requirements
-* Python (Either 2 or 3)
+* Python 2
 * [Chainer](http://chainer.org/) (newer versions)
 * [Cython](http://cython.org/)
 * A C++ compiler supporting [C++11 standard](https://en.wikipedia.org/wiki/C%2B%2B11)

diff --git a/src/py/ccgbank.py b/src/py/ccgbank.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
+import codecs
 import re
 import os
 import py.cat
@@ -25,7 +26,7 @@ def walk_autodir(path, subset="train"):
 
 class AutoReader(object):
     def __init__(self, filename):
-        self.lines = open(filename).readlines()
+        self.lines = codecs.open(filename, encoding='UTF-8').readlines()
 
     def readall(self, suppress_error=False):
         # Inputs:
@@ -52,7 +53,7 @@ def readall(self, suppress_error=False):
 
 class AutoLineReader(object):
     def __init__(self, line):
-        self.line = line.encode("utf-8")
+        self.line = line
         self.index = 0
         self.word_id = -1
 
@@ -80,7 +81,7 @@ def next_node(self):
         elif self.line[self.index+2] == "T":
             return self.parse_tree
         else:
-            raise RuntimeError()
+            raise RuntimeError("AUTO parse error: expected string starting with ' <L' or ' <T', but got this string: '" + self.line[self.index:] + "' in line: '" + self.line + "'")
 
     def parse_leaf(self):
         self.word_id += 1

diff --git a/src/py/lstm_parser.py b/src/py/lstm_parser.py
@@ -1,5 +1,6 @@
 
 from __future__ import print_function, unicode_literals
+import codecs
 import sys
 import random
 import numpy as np
@@ -69,7 +70,7 @@ def _traverse(self, tree):
     def _write(dct, out, comment_out_value=False):
         print("writing to", out.name, file=sys.stderr)
         for key, value in dct.items():
-            out.write(key.encode("utf-8") + " ")
+            out.write(key + " ")
             if comment_out_value:
                 out.write("# ")
             out.write(str(value) + "\n")
@@ -98,7 +99,7 @@ def _to_conll(self, out):
         for sent, tags, (cats, deps) in self.samples:
             for i, (w, t, c, d) in enumerate(zip(sent.split(" "), tags, cats, deps), 1):
                 out.write("{0}\t{1}\t{1}\t{2}\t{2}\t_\t{4}\tnone\t_\t{3}\n"
-                        .format(i, w.encode("utf-8"), t, c, d))
+                        .format(i, w, t, c, d))
             out.write("\n")
 
     def _create_samples(self, trees):
@@ -144,17 +145,17 @@ def create_traindata(args):
             self._write(self.seen_rules, f, comment_out_value=True)
         with open(args.out + "/target.txt", "w") as f:
             self._write(self.cats, f, comment_out_value=False)
-        with open(args.out + "/words.txt", "w") as f:
+        with codecs.open(args.out + "/words.txt", "w", encoding="UTF-8") as f:
             self._write(self.words, f, comment_out_value=False)
-        with open(args.out + "/suffixes.txt", "w") as f:
+        with codecs.open(args.out + "/suffixes.txt", "w", encoding='UTF-8') as f:
             self._write(self.suffixes, f, comment_out_value=False)
-        with open(args.out + "/prefixes.txt", "w") as f:
+        with codecs.open(args.out + "/prefixes.txt", "w", encoding='UTF-8') as f:
             self._write(self.prefixes, f, comment_out_value=False)
         with open(args.out + "/traindata.json", "w") as f:
             json.dump([(s, t) for (s, _, t) in self.samples], f) # no need for tags
-        with open(args.out + "/trainsents.txt", "w") as f:
-            for sent in self.sents: f.write(sent.encode("utf-8") + "\n")
-        with open(args.out + "/trainsents.conll", "w") as f:
+        with codecs.open(args.out + "/trainsents.txt", "w", encoding="UTF-8") as f:
+            for sent in self.sents: f.write(sent + "\n")
+        with codecs.open(args.out + "/trainsents.conll", "w", encoding="UTF-8") as f:
             self._to_conll(f)
 
     @staticmethod