From 8cf8cafcb66cbba8ea2fa949ecb96abb665cb7b2 Mon Sep 17 00:00:00 2001
From: liisaratsep <liisaratsep@gmail.com>
Date: Wed, 21 Jun 2017 20:21:02 +0300
Subject: [PATCH 1/2] Fixed test-mode evaluation (corrected misspelled variable 'tespath' -> 'testpath')

---
 barchybrid/src/parser.py | 8 ++++----
 bmstparser/src/parser.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/barchybrid/src/parser.py b/barchybrid/src/parser.py
index 5d0f75f..daf3aa6 100644
--- a/barchybrid/src/parser.py
+++ b/barchybrid/src/parser.py
@@ -73,16 +73,16 @@
         parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
         parser.Load(options.model)
         conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
-        tespath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu')
+        testpath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu')
         ts = time.time()
         pred = list(parser.Predict(options.conll_test))
         te = time.time()
-        utils.write_conll(tespath, pred)
+        utils.write_conll(testpath, pred)
 
         if not conllu:
-            os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath  + ' > ' + tespath + '.txt')
+            os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + testpath  + ' > ' + testpath + '.txt')
         else:
-            os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + tespath + ' > ' + testpath + '.txt')
+            os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + testpath + ' > ' + testpath + '.txt')
         
         print 'Finished predicting test',te-ts
 
diff --git a/bmstparser/src/parser.py b/bmstparser/src/parser.py
index 28ba87c..cf593ef 100644
--- a/bmstparser/src/parser.py
+++ b/bmstparser/src/parser.py
@@ -44,18 +44,18 @@
 
         parser.Load(options.model)
         conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
-        tespath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu')
+        testpath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu')
 
         ts = time.time()
         test_res = list(parser.Predict(options.conll_test))
         te = time.time()
         print 'Finished predicting test.', te-ts, 'seconds.'
-        utils.write_conll(tespath, test_res)
+        utils.write_conll(testpath, test_res)
 
         if not conllu:
-            os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath  + ' > ' + tespath + '.txt')
+            os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + testpath  + ' > ' + testpath + '.txt')
         else:
-            os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + tespath + ' > ' + testpath + '.txt')
+            os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + testpath + ' > ' + testpath + '.txt')
     else:
         print 'Preparing vocab'
         words, w2i, pos, rels = utils.vocab(options.conll_train)

From 3fc7e34e6dcf7c2497079bd5d985746844af4a42 Mon Sep 17 00:00:00 2001
From: liisaratsep <liisaratsep@gmail.com>
Date: Wed, 21 Jun 2017 23:23:25 +0300
Subject: [PATCH 2/2] Added --cpos option to use the CPOS/UPOS field instead of POS/XPOS for parsing

---
 barchybrid/src/arc_hybrid.py |  7 ++++++-
 barchybrid/src/parser.py     |  3 ++-
 barchybrid/src/utils.py      |  7 +++++--
 bmstparser/src/mstlstm.py    | 11 +++++++++--
 bmstparser/src/parser.py     |  3 ++-
 bmstparser/src/utils.py      |  7 +++++--
 6 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/barchybrid/src/arc_hybrid.py b/barchybrid/src/arc_hybrid.py
index d40e182..c70d7c8 100644
--- a/barchybrid/src/arc_hybrid.py
+++ b/barchybrid/src/arc_hybrid.py
@@ -32,6 +32,8 @@ def __init__(self, words, pos, rels, w2i, options):
         self.rlFlag = options.rlFlag
         self.k = options.window
 
+        self.cposFlag = options.cposFlag
+
         self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0)
 
         self.external_embedding = None
@@ -171,7 +173,10 @@ def getWordEmbeddings(self, sentence, train):
             c = float(self.wordsCount.get(root.norm, 0))
             dropFlag =  not train or (random.random() < (c/(0.25+c)))
             root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0]
-            root.posvec = self.plookup[int(self.pos[root.pos])] if self.pdims > 0 else None
+            if self.cposFlag:
+                root.posvec = self.plookup[int(self.pos[root.cpos])] if self.pdims > 0 else None
+            else:
+                root.posvec = self.plookup[int(self.pos[root.pos])] if self.pdims > 0 else None
 
             if self.external_embedding is not None:
                 #if not dropFlag and random.random() < 0.5:
diff --git a/barchybrid/src/parser.py b/barchybrid/src/parser.py
index daf3aa6..40712d4 100644
--- a/barchybrid/src/parser.py
+++ b/barchybrid/src/parser.py
@@ -31,6 +31,7 @@
     parser.add_option("--userl", action="store_true", dest="rlMostFlag", default=False)
     parser.add_option("--predict", action="store_true", dest="predictFlag", default=False)
     parser.add_option("--dynet-mem", type="int", dest="cnn_mem", default=512)
+    parser.add_option("--cpos", action="store_true", help="To use CPOS/UPOS field instead of POS/XPOS", dest="cposFlag", default=False)
 
     (options, args) = parser.parse_args()
     print 'Using external embedding:', options.external_embedding
@@ -41,7 +42,7 @@
             sys.exit()
 
         print 'Preparing vocab'
-        words, w2i, pos, rels = utils.vocab(options.conll_train)
+        words, w2i, pos, rels = utils.vocab(options.conll_train, options.cposFlag)
 
         with open(os.path.join(options.output, options.params), 'w') as paramsfp:
             pickle.dump((words, w2i, pos, rels, options), paramsfp)
diff --git a/barchybrid/src/utils.py b/barchybrid/src/utils.py
index 3629151..1a1832e 100644
--- a/barchybrid/src/utils.py
+++ b/barchybrid/src/utils.py
@@ -69,7 +69,7 @@ def isProj(sentence):
     return len(forest.roots) == 1
 
 
-def vocab(conll_path):
+def vocab(conll_path, cposFlag):
     wordsCount = Counter()
     posCount = Counter()
     relCount = Counter()
@@ -77,7 +77,10 @@ def vocab(conll_path):
     with open(conll_path, 'r') as conllFP:
         for sentence in read_conll(conllFP, True):
             wordsCount.update([node.norm for node in sentence if isinstance(node, ConllEntry)])
-            posCount.update([node.pos for node in sentence if isinstance(node, ConllEntry)])
+            if cposFlag:
+                posCount.update([node.cpos for node in sentence if isinstance(node, ConllEntry)])
+            else:
+                posCount.update([node.pos for node in sentence if isinstance(node, ConllEntry)])
             relCount.update([node.relation for node in sentence if isinstance(node, ConllEntry)])
 
     return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, posCount.keys(), relCount.keys())
diff --git a/bmstparser/src/mstlstm.py b/bmstparser/src/mstlstm.py
index 2fa2209..a0642f8 100644
--- a/bmstparser/src/mstlstm.py
+++ b/bmstparser/src/mstlstm.py
@@ -30,6 +30,7 @@ def __init__(self, vocab, pos, rels, w2i, options):
         self.rels = {word: ind for ind, word in enumerate(rels)}
         self.irels = rels
 
+        self.cposFlag = options.cposFlag
 
         self.external_embedding, self.edim = None, 0
         if options.external_embedding is not None:
@@ -146,7 +147,10 @@ def Predict(self, conll_path):
 
                 for entry in conll_sentence:
                     wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
-                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+                    if self.cposFlag:
+                        posvec = self.plookup[int(self.pos[entry.cpos])] if self.pdims > 0 else None
+                    else:
+                        posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                     evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] if self.external_embedding is not None else None
                     entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))
 
@@ -234,7 +238,10 @@ def Train(self, conll_path):
                     c = float(self.wordsCount.get(entry.norm, 0))
                     dropFlag = (random.random() < (c/(0.25+c)))
                     wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
-                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+                    if self.cposFlag:
+                        posvec = self.plookup[int(self.pos[entry.cpos])] if self.pdims > 0 else None
+                    else:
+                        posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                     evec = None
 
                     if self.external_embedding is not None:
diff --git a/bmstparser/src/parser.py b/bmstparser/src/parser.py
index cf593ef..6eb3645 100644
--- a/bmstparser/src/parser.py
+++ b/bmstparser/src/parser.py
@@ -28,6 +28,7 @@
     parser.add_option("--disablecostaug", action="store_false", dest="costaugFlag", default=True)
     parser.add_option("--dynet-seed", type="int", dest="seed", default=0)
     parser.add_option("--dynet-mem", type="int", dest="mem", default=0)
+    parser.add_option("--cpos", action="store_true", help="To use CPOS/UPOS field instead of POS/XPOS", dest="cposFlag", default=False)
 
     (options, args) = parser.parse_args()
 
@@ -58,7 +59,7 @@
             os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + testpath + ' > ' + testpath + '.txt')
     else:
         print 'Preparing vocab'
-        words, w2i, pos, rels = utils.vocab(options.conll_train)
+        words, w2i, pos, rels = utils.vocab(options.conll_train, options.cposFlag)
 
         with open(os.path.join(options.output, options.params), 'w') as paramsfp:
             pickle.dump((words, w2i, pos, rels, options), paramsfp)
diff --git a/bmstparser/src/utils.py b/bmstparser/src/utils.py
index e9efff7..7ebcf04 100644
--- a/bmstparser/src/utils.py
+++ b/bmstparser/src/utils.py
@@ -25,7 +25,7 @@ def __str__(self):
         return '\t'.join(['_' if v is None else v for v in values])
 
 
-def vocab(conll_path):
+def vocab(conll_path, cposFlag):
     wordsCount = Counter()
     posCount = Counter()
     relCount = Counter()
@@ -33,7 +33,10 @@ def vocab(conll_path):
     with open(conll_path, 'r') as conllFP:
         for sentence in read_conll(conllFP):
             wordsCount.update([node.norm for node in sentence if isinstance(node, ConllEntry)])
-            posCount.update([node.pos for node in sentence if isinstance(node, ConllEntry)])
+            if cposFlag:
+                posCount.update([node.cpos for node in sentence if isinstance(node, ConllEntry)])
+            else:
+                posCount.update([node.pos for node in sentence if isinstance(node, ConllEntry)])
             relCount.update([node.relation for node in sentence if isinstance(node, ConllEntry)])
 
     return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, posCount.keys(), relCount.keys())