Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for model creation #2

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Codebase for [A\* CCG Parsing with a Supertag and Dependency Factored Model](https://arxiv.org/abs/1704.06936)

#### Requirements
* Python (Either 2 or 3)
* Python 2
* [Chainer](http://chainer.org/) (newer versions)
* [Cython](http://cython.org/)
* A C++ compiler supporting [C++11 standard](https://en.wikipedia.org/wiki/C%2B%2B11)
Expand Down
7 changes: 4 additions & 3 deletions src/py/ccgbank.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import codecs
import re
import os
import py.cat
Expand All @@ -25,7 +26,7 @@ def walk_autodir(path, subset="train"):

class AutoReader(object):
def __init__(self, filename):
self.lines = open(filename).readlines()
self.lines = codecs.open(filename, encoding='UTF-8').readlines()

def readall(self, suppress_error=False):
# Inputs:
Expand All @@ -52,7 +53,7 @@ def readall(self, suppress_error=False):

class AutoLineReader(object):
def __init__(self, line):
self.line = line.encode("utf-8")
self.line = line
self.index = 0
self.word_id = -1

Expand Down Expand Up @@ -80,7 +81,7 @@ def next_node(self):
elif self.line[self.index+2] == "T":
return self.parse_tree
else:
raise RuntimeError()
raise RuntimeError("AUTO parse error: expected string starting with ' <L' or ' <T', but got this string: '" + self.line[self.index:] + ' in line: ' + self.line)

def parse_leaf(self):
self.word_id += 1
Expand Down
17 changes: 9 additions & 8 deletions src/py/lstm_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

from __future__ import print_function, unicode_literals
import codecs
import sys
import random
import numpy as np
Expand Down Expand Up @@ -69,7 +70,7 @@ def _traverse(self, tree):
def _write(dct, out, comment_out_value=False):
print("writing to", out.name, file=sys.stderr)
for key, value in dct.items():
out.write(key.encode("utf-8") + " ")
out.write(key + " ")
if comment_out_value:
out.write("# ")
out.write(str(value) + "\n")
Expand Down Expand Up @@ -98,7 +99,7 @@ def _to_conll(self, out):
for sent, tags, (cats, deps) in self.samples:
for i, (w, t, c, d) in enumerate(zip(sent.split(" "), tags, cats, deps), 1):
out.write("{0}\t{1}\t{1}\t{2}\t{2}\t_\t{4}\tnone\t_\t{3}\n"
.format(i, w.encode("utf-8"), t, c, d))
.format(i, w, t, c, d))
out.write("\n")

def _create_samples(self, trees):
Expand Down Expand Up @@ -144,17 +145,17 @@ def create_traindata(args):
self._write(self.seen_rules, f, comment_out_value=True)
with open(args.out + "/target.txt", "w") as f:
self._write(self.cats, f, comment_out_value=False)
with open(args.out + "/words.txt", "w") as f:
with codecs.open(args.out + "/words.txt", "w", encoding="UTF-8") as f:
self._write(self.words, f, comment_out_value=False)
with open(args.out + "/suffixes.txt", "w") as f:
with codecs.open(args.out + "/suffixes.txt", "w", encoding='UTF-8') as f:
self._write(self.suffixes, f, comment_out_value=False)
with open(args.out + "/prefixes.txt", "w") as f:
with codecs.open(args.out + "/prefixes.txt", "w", encoding='UTF-8') as f:
self._write(self.prefixes, f, comment_out_value=False)
with open(args.out + "/traindata.json", "w") as f:
json.dump([(s, t) for (s, _, t) in self.samples], f) # no need for tags
with open(args.out + "/trainsents.txt", "w") as f:
for sent in self.sents: f.write(sent.encode("utf-8") + "\n")
with open(args.out + "/trainsents.conll", "w") as f:
with codecs.open(args.out + "/trainsents.txt", "w", encoding="UTF-8") as f:
for sent in self.sents: f.write(sent + "\n")
with codecs.open(args.out + "/trainsents.conll", "w", encoding="UTF-8") as f:
self._to_conll(f)

@staticmethod
Expand Down