From 8cf8cafcb66cbba8ea2fa949ecb96abb665cb7b2 Mon Sep 17 00:00:00 2001 From: liisaratsep Date: Wed, 21 Jun 2017 20:21:02 +0300 Subject: [PATCH 1/2] Fixed testing (corrected 'tespath' -> 'testpath') --- barchybrid/src/parser.py | 8 ++++---- bmstparser/src/parser.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/barchybrid/src/parser.py b/barchybrid/src/parser.py index 5d0f75f..daf3aa6 100644 --- a/barchybrid/src/parser.py +++ b/barchybrid/src/parser.py @@ -73,16 +73,16 @@ parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt) parser.Load(options.model) conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu') - tespath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu') + testpath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu') ts = time.time() pred = list(parser.Predict(options.conll_test)) te = time.time() - utils.write_conll(tespath, pred) + utils.write_conll(testpath, pred) if not conllu: - os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath + ' > ' + tespath + '.txt') + os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + testpath + ' > ' + testpath + '.txt') else: - os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + tespath + ' > ' + testpath + '.txt') + os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + testpath + ' > ' + testpath + '.txt') print 'Finished predicting test',te-ts diff --git a/bmstparser/src/parser.py b/bmstparser/src/parser.py index 28ba87c..cf593ef 100644 --- a/bmstparser/src/parser.py +++ b/bmstparser/src/parser.py @@ -44,18 +44,18 @@ parser.Load(options.model) conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu') - tespath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu') + testpath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu') ts = time.time() test_res = list(parser.Predict(options.conll_test)) te = time.time() print 'Finished predicting test.', te-ts, 'seconds.' - utils.write_conll(tespath, test_res) + utils.write_conll(testpath, test_res) if not conllu: - os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath + ' > ' + tespath + '.txt') + os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + testpath + ' > ' + testpath + '.txt') else: - os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + tespath + ' > ' + testpath + '.txt') + os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + testpath + ' > ' + testpath + '.txt') else: print 'Preparing vocab' words, w2i, pos, rels = utils.vocab(options.conll_train) From 3fc7e34e6dcf7c2497079bd5d985746844af4a42 Mon Sep 17 00:00:00 2001 From: liisaratsep Date: Wed, 21 Jun 2017 23:23:25 +0300 Subject: [PATCH 2/2] Added support to use CPOS/UPOS for parsing --- barchybrid/src/arc_hybrid.py | 7 ++++++- barchybrid/src/parser.py | 3 ++- barchybrid/src/utils.py | 7 +++++-- bmstparser/src/mstlstm.py | 11 +++++++++-- bmstparser/src/parser.py | 3 ++- bmstparser/src/utils.py | 7 +++++-- 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/barchybrid/src/arc_hybrid.py b/barchybrid/src/arc_hybrid.py index d40e182..c70d7c8 100644 --- a/barchybrid/src/arc_hybrid.py +++ b/barchybrid/src/arc_hybrid.py @@ -32,6 +32,8 @@ def __init__(self, words, pos, rels, w2i, options): self.rlFlag = options.rlFlag self.k = options.window + self.cposFlag = options.cposFlag + self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0) self.external_embedding = None @@ -171,7 +173,10 @@ def getWordEmbeddings(self, sentence, train): c = float(self.wordsCount.get(root.norm, 0)) dropFlag = not train or (random.random() < (c/(0.25+c))) root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0] - root.posvec = self.plookup[int(self.pos[root.pos])] if self.pdims > 0 else None + if self.cposFlag: + root.posvec = self.plookup[int(self.pos[root.cpos])] if self.pdims > 0 else None + else: + root.posvec = self.plookup[int(self.pos[root.pos])] if self.pdims > 0 else None if self.external_embedding is not None: #if not dropFlag and random.random() < 0.5: diff --git a/barchybrid/src/parser.py b/barchybrid/src/parser.py index daf3aa6..40712d4 100644 --- a/barchybrid/src/parser.py +++ b/barchybrid/src/parser.py @@ -31,6 +31,7 @@ parser.add_option("--userl", action="store_true", dest="rlMostFlag", default=False) parser.add_option("--predict", action="store_true", dest="predictFlag", default=False) parser.add_option("--dynet-mem", type="int", dest="cnn_mem", default=512) + parser.add_option("--cpos", action="store_true", help="To use CPOS/UPOS field instead of POS/XPOS", dest="cposFlag", default=False) (options, args) = parser.parse_args() print 'Using external embedding:', options.external_embedding @@ -41,7 +42,7 @@ sys.exit() print 'Preparing vocab' - words, w2i, pos, rels = utils.vocab(options.conll_train) + words, w2i, pos, rels = utils.vocab(options.conll_train, options.cposFlag) with open(os.path.join(options.output, options.params), 'w') as paramsfp: pickle.dump((words, w2i, pos, rels, options), paramsfp) diff --git a/barchybrid/src/utils.py b/barchybrid/src/utils.py index 3629151..1a1832e 100644 --- a/barchybrid/src/utils.py +++ b/barchybrid/src/utils.py @@ -69,7 +69,7 @@ def isProj(sentence): return len(forest.roots) == 1 -def vocab(conll_path): +def vocab(conll_path, cposFlag): wordsCount = Counter() posCount = Counter() relCount = Counter() @@ -77,7 +77,10 @@ def vocab(conll_path): with open(conll_path, 'r') as conllFP: for sentence in read_conll(conllFP, True): wordsCount.update([node.norm for node in sentence if isinstance(node, ConllEntry)]) - posCount.update([node.pos for node in sentence if isinstance(node, ConllEntry)]) + if cposFlag: + posCount.update([node.cpos for node in sentence if isinstance(node, ConllEntry)]) + else: + posCount.update([node.pos for node in sentence if isinstance(node, ConllEntry)]) relCount.update([node.relation for node in sentence if isinstance(node, ConllEntry)]) return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, posCount.keys(), relCount.keys()) diff --git a/bmstparser/src/mstlstm.py b/bmstparser/src/mstlstm.py index 2fa2209..a0642f8 100644 --- a/bmstparser/src/mstlstm.py +++ b/bmstparser/src/mstlstm.py @@ -30,6 +30,7 @@ def __init__(self, vocab, pos, rels, w2i, options): self.rels = {word: ind for ind, word in enumerate(rels)} self.irels = rels + self.cposFlag = options.cposFlag self.external_embedding, self.edim = None, 0 if options.external_embedding is not None: @@ -146,7 +147,10 @@ def Predict(self, conll_path): for entry in conll_sentence: wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None - posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None + if self.cposFlag: + posvec = self.plookup[int(self.pos[entry.cpos])] if self.pdims > 0 else None + else: + posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] if self.external_embedding is not None else None entry.vec = concatenate(filter(None, [wordvec, posvec, evec])) @@ -234,7 +238,10 @@ def Train(self, conll_path): c = float(self.wordsCount.get(entry.norm, 0)) dropFlag = (random.random() < (c/(0.25+c))) wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None - posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None + if self.cposFlag: + posvec = self.plookup[int(self.pos[entry.cpos])] if self.pdims > 0 else None + else: + posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None evec = None if self.external_embedding is not None: diff --git a/bmstparser/src/parser.py b/bmstparser/src/parser.py index cf593ef..6eb3645 100644 --- a/bmstparser/src/parser.py +++ b/bmstparser/src/parser.py @@ -28,6 +28,7 @@ parser.add_option("--disablecostaug", action="store_false", dest="costaugFlag", default=True) parser.add_option("--dynet-seed", type="int", dest="seed", default=0) parser.add_option("--dynet-mem", type="int", dest="mem", default=0) + parser.add_option("--cpos", action="store_true", help="To use CPOS/UPOS field instead of POS/XPOS", dest="cposFlag", default=False) (options, args) = parser.parse_args() @@ -58,7 +59,7 @@ os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + testpath + ' > ' + testpath + '.txt') else: print 'Preparing vocab' - words, w2i, pos, rels = utils.vocab(options.conll_train) + words, w2i, pos, rels = utils.vocab(options.conll_train, options.cposFlag) with open(os.path.join(options.output, options.params), 'w') as paramsfp: pickle.dump((words, w2i, pos, rels, options), paramsfp) diff --git a/bmstparser/src/utils.py b/bmstparser/src/utils.py index e9efff7..7ebcf04 100644 --- a/bmstparser/src/utils.py +++ b/bmstparser/src/utils.py @@ -25,7 +25,7 @@ def __str__(self): return '\t'.join(['_' if v is None else v for v in values]) -def vocab(conll_path): +def vocab(conll_path, cposFlag): wordsCount = Counter() posCount = Counter() relCount = Counter() @@ -33,7 +33,10 @@ def vocab(conll_path): with open(conll_path, 'r') as conllFP: for sentence in read_conll(conllFP): wordsCount.update([node.norm for node in sentence if isinstance(node, ConllEntry)]) - posCount.update([node.pos for node in sentence if isinstance(node, ConllEntry)]) + if cposFlag: + posCount.update([node.cpos for node in sentence if isinstance(node, ConllEntry)]) + else: + posCount.update([node.pos for node in sentence if isinstance(node, ConllEntry)]) relCount.update([node.relation for node in sentence if isinstance(node, ConllEntry)]) return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, posCount.keys(), relCount.keys())