From 606de016ab85786cf821457c13eead045aa9c2e8 Mon Sep 17 00:00:00 2001
From: Hai Liang Wang
Date: Fri, 16 Mar 2018 18:46:43 +0800
Subject: [PATCH] Enable train & test with yoavg's code

---
 .gitignore                  |   1 -
 README.md                   |  29 +++-
 admin/test.sh               |  18 +++
 admin/{dev.sh => train.sh}  |   6 +-
 app/README                  |  43 -----
 app/eager.py                | 305 +++++++++++++++++++++---------------
 app/pio/io.py               |  36 ++++-
 {app => data}/conll.example |   0
 tmp/.gitignore              |   2 +
 9 files changed, 261 insertions(+), 179 deletions(-)
 create mode 100755 admin/test.sh
 rename admin/{dev.sh => train.sh} (60%)
 delete mode 100644 app/README
 rename {app => data}/conll.example (100%)
 create mode 100644 tmp/.gitignore

diff --git a/.gitignore b/.gitignore
index d1ac3e2..da62e48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,6 @@
 *.pyc
 jmeter.log
 __pycache__
-tmp/
 node_modules/
 sftp-config.json
 .DS_Store
diff --git a/README.md b/README.md
index 596a552..3680cda 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,12 @@
 # text-dependency-parser: Dependency Parsing
 
+![](https://camo.githubusercontent.com/ae91a5698ad80d3fe8e0eb5a4c6ee7170e088a7d/687474703a2f2f37786b6571692e636f6d312e7a302e676c622e636c6f7564646e2e636f6d2f61692f53637265656e25323053686f74253230323031372d30342d30342532306174253230382e32302e3437253230504d2e706e67)
+
 ## Data format: [CoNLL-2009 Shared Task](http://ufal.mff.cuni.cz/conll2009-st/task-description.html)
 
-### universaldependencies
+### Universal Dependencies
 http://universaldependencies.org/
 
 ### Training set: 20,000 sentences from the Tsinghua University semantic dependency corpus
@@ -13,5 +15,26 @@ http://www.hankcs.com/nlp/corpus/chinese-treebank.html#h3-6
 
 ### Chinese Treebank
 http://www.hankcs.com/nlp/corpus/chinese-treebank.html
 
-### Transition-based dependency parsers
-https://www.cs.bgu.ac.il/~yoavg/software/transitionparser/
\ No newline at end of file
+## Run
+
+Train a model:
+
+```
+admin/train.sh
+```
+
+Test the trained model:
+
+```
+admin/test.sh
+```
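+
+Both scripts are thin wrappers around `app/eager.py`. Assuming the flags it
+defines, the equivalent direct call for training (this mirrors what
+`admin/train.sh` runs; paths are relative to `app/`) is:
+
+```
+cd app
+python eager.py \
+  --verbosity=1 \
+  --train=True \
+  --train_data=../data/UD_English-EWT/en-ud-train.conllu \
+  --model=../tmp/eager.model
+```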
+
+## Credits
+
+[Transition Based Dependency Parsers](https://www.cs.bgu.ac.il/~yoavg/software/transitionparser/)
+
+## References
+
+[1] Liang Huang, Wenbin Jiang and Qun Liu. 2009.
+    Bilingually-Constrained (Monolingual) Shift-Reduce Parsing.
\ No newline at end of file
diff --git a/admin/test.sh b/admin/test.sh
new file mode 100755
index 0000000..4c0c5ae
--- /dev/null
+++ b/admin/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+###########################################
+#
+###########################################
+
+# constants
+baseDir=$(cd `dirname "$0"`;pwd)
+# functions
+
+# main
+[ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return
+cd $baseDir/../app
+python eager.py \
+    --verbosity=1 \
+    --test=True \
+    --model=$baseDir/../tmp/eager.model \
+    --test_data=$baseDir/../data/UD_English-EWT/en-ud-test.conllu \
+    --test_results=$baseDir/../tmp/en-ud-test.results
\ No newline at end of file
diff --git a/admin/dev.sh b/admin/train.sh
similarity index 60%
rename from admin/dev.sh
rename to admin/train.sh
index 7460d99..59f294c 100755
--- a/admin/dev.sh
+++ b/admin/train.sh
@@ -10,4 +10,8 @@ baseDir=$(cd `dirname "$0"`;pwd)
 # main
 [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return
 cd $baseDir/../app
-python parser.py Test.test_UD_English_EWT
\ No newline at end of file
+python eager.py \
+    --verbosity=1 \
+    --train=True \
+    --train_data=$baseDir/../data/UD_English-EWT/en-ud-train.conllu \
+    --model=$baseDir/../tmp/eager.model
\ No newline at end of file
diff --git a/app/README b/app/README
deleted file mode 100644
index 5644bc2..0000000
--- a/app/README
+++ /dev/null
@@ -1,43 +0,0 @@
-Transition Based Dependency Parsers
-
-These are implementations of the (unlabeled) arc-eager and arc-standard dependency parsing algorithms.
-These parsers are very fast and are reasonably accurate.
-In particular, the arc-standard parser with the features described in [1] (the default feature set) can achieve very competitive accuracies.
-
-The input file for both training and parsing should be in CoNLL format (see conll.example).
-Columns 8,9,10 are always ignored (but must be present).
-When parsing new text, you can put whatever you want in column 7, the parser will overwrite it (it uses this column to report accuracy scores)
-
-Compiling:
-==========
-Speed is achieved using a c/cython extension module.
-This needs to be compiled using either cython or a c compiler.
-See instructions in ml/README
-
-Training the parsers:
-=====================
-
-   ./eager.py -o model_file [options] conll_input_file
-
-   or
-
-   ./standard.py -o model_file [options] conll_input_file
-
-   (use -f instead of -o to create feature vector files for training with an external classifier. If you don't know what it means,
-    just ignore this option. The model file format is the same as Megam's.)
-
-Parsing new text with the trained model:
-========================================
-
-   ./eager.py -m model_file [options] conll_file_to_parse > output
-
-   or
-
-   ./standard.py -m model_file [options] conll_file_to_parse > output
-
-
-References:
-~~~~~~~~~~~
-[1] Liang Huang, Wenbin Jiang and Qun Liu. 2009.
-    Bilingually-Constrained (Monolingual) Shift-Reduce Parsing. 
- diff --git a/app/eager.py b/app/eager.py index 21a245c..6f2a314 100755 --- a/app/eager.py +++ b/app/eager.py @@ -18,134 +18,193 @@ Author: Yoav Goldberg (yoav.goldberg@gmail.com) """ -from features import extractors -from params import parser +from __future__ import print_function +from __future__ import division -opts, args = parser.parse_args() - -if opts.trainfile: - MODE='train' - TRAIN_OUT_FILE=opts.trainfile -elif opts.externaltrainfile: - MODE='write' - TRAIN_OUT_FILE=opts.externaltrainfile -else: - MODE='test' - -if opts.SCORES_OUT: - scores_out = file("eager.scores","w") +import os +import sys +curdir = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(curdir) -DATA_FILE=args[0] +if sys.version_info[0] < 3: + reload(sys) + sys.setdefaultencoding("utf-8") + # raise "Must be using Python 3" -######## +from absl import app +from absl import flags +from absl import logging -import sys from ml import ml - from pio import io from transitionparser import * - -featExt = extractors.get(opts.feature_extarctor) - -sents = list(io.conll_to_sents(file(DATA_FILE))) - -if opts.only_proj: - import isprojective - sents = [s for s in sents if isprojective.is_projective(s)] - -if opts.UNLEX: - from shared.lemmatize import EnglishMinimalWordSmoother - smoother = EnglishMinimalWordSmoother.from_words_file("1000words") - for sent in sents: - for tok in sent: - tok['oform']=tok['form'] - tok['form'] = smoother.get(tok['form']) - -if MODE=="write": - fout = file(TRAIN_OUT_FILE,"w") - trainer = LoggingActionDecider(ArcEagerParsingOracle(pop_when_can=opts.POP_WHEN_CAN),featExt,fout) - p = ArcEagerParser( trainer) - for i,sent in enumerate(sents): - sys.stderr.write(". %s " % i) - sys.stderr.flush() - d=p.parse(sent) - sys.exit() - - -if MODE=="train": - fout = file(TRAIN_OUT_FILE,"w") - nactions = 4 - trainer = MLTrainerActionDecider(ml.MultitronParameters(nactions), ArcEagerParsingOracle(pop_when_can=opts.POP_WHEN_CAN), featExt) - p = ArcEagerParser( trainer) - import random - random.seed("seed") - #random.shuffle(sents) - for x in xrange(10): - print "iter ",x - for i,sent in enumerate(sents): - if i % 500 == 0: print i, - try: - d=p.parse(sent) - except IndexError,e: - print "prob in sent:",i - print "\n".join(["%s %s %s %s" % (t['id'],t['form'],t['tag'],t['parent']) for t in sent]) - raise e - trainer.save(fout) - sys.exit() -# test -elif MODE=="test": - p = ArcEagerParser(MLActionDecider(ml.MulticlassModel(opts.modelfile),featExt)) - -good = 0.0 -bad = 0.0 -complete=0.0 - -#main test loop -reals = set() -preds = set() - -for i,sent in enumerate(sents): - sgood=0.0 - sbad=0.0 - mistake=False - sys.stderr.write("%s %s %s\n"% ( "@@@",i,good/(good+bad+1))) - try: - d=p.parse(sent) - except MLTrainerWrongActionException: - # this happens only in "early update" parsers, and then we just go on to - # the next sentence.. 
-        continue
-    sent = d.annotate_allow_none(sent)
-    for tok in sent:
-        if opts.ignore_punc and tok['form'][0] in "`',.-;:!?{}": continue
-        reals.add((i,tok['parent'],tok['id']))
-        preds.add((i,tok['pparent'],tok['id']))
-        if tok['pparent']==-1:continue
-        if tok['parent']==tok['pparent'] or tok['pparent']==-1:
-            good+=1
-            sgood+=1
-        else:
-            bad+=1
-            sbad+=1
-            mistake=True
-    #print
-    if opts.UNLEX:
-        io.out_conll(sent,parent='pparent',form='oform')
-    else:
-        io.out_conll(sent,parent='pparent',form='form')
-    if not mistake: complete+=1
-    #sys.exit()
-    if opts.SCORES_OUT:
-        scores_out.write("%s\n" % (sgood/(sgood+sbad)))
+from features import extractors
+
+FLAGS = flags.FLAGS
+'''
+General
+'''
+
+flags.DEFINE_boolean('ignore_punc', False, 'Ignore punctuation when scoring.')
+flags.DEFINE_boolean('only_projective', False, 'Keep only projective sentences.')
+flags.DEFINE_boolean('lazypop', True, 'Passed to the arc-eager oracle as pop_when_can.')
+flags.DEFINE_boolean('unlex', False, 'Unlexicalize rare word forms (uses the 1000words list).')
+flags.DEFINE_string('feature_extractor', 'eager.zhang', 'Feature extractor.')
+flags.DEFINE_string('model', os.path.join(curdir, os.path.pardir, "tmp", "eager.model"), 'Transition parser model path.')
+
+'''
+Train
+'''
+flags.DEFINE_boolean('train', False, 'Train a model on the training data.')
+flags.DEFINE_integer('epoch', 1, 'Number of training epochs.')
+flags.DEFINE_string('train_data', os.path.join(curdir, os.path.pardir, "data", "conll.example"), 'Training data (CoNLL format).')
+
+flags.DEFINE_string('externaltrainfile', None, 'Write feature vectors for an external classifier to this file.')
+# flags.DEFINE_string('modelfile', 'data/weights', 'Model File.')
+
+'''
+Test
+'''
+flags.DEFINE_boolean('test', False, 'Evaluate with the test data.')
+flags.DEFINE_string('test_data', os.path.join(curdir, os.path.pardir, "data", "conll.example"), 'Test data (CoNLL format).')
+flags.DEFINE_string('test_results', os.path.join(curdir, os.path.pardir, "tmp", "eager.test.results"), 'File to write per-sentence scores to.')
+
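+# Input format note: train and test data are CoNLL-style files, one token per
+# line with 10 tab-separated columns, as read by pio/io.py:
+#   0:ID 1:FORM 2:LEMMA 3:CPOSTAG 4:POSTAG 5:FEATS 6:HEAD 7:DEPREL 8:PHEAD/DEPS 9:MISC
+# The reader skips comment lines and rows whose HEAD or DEPREL is "_"
+# (e.g. multiword-token ranges in CoNLL-U files).
+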
+def transform_conll_sents(conll_file_path):
+    '''
+    Read a CoNLL file and return its sentences, optionally filtered and unlexicalized.
+    '''
+    sents = list(io.conll_to_sents(open(conll_file_path)))
+
+    if FLAGS.only_projective:
+        import isprojective
+        sents = [s for s in sents if isprojective.is_projective(s)]
+
+    if FLAGS.unlex:
+        from shared.lemmatize import EnglishMinimalWordSmoother
+        smoother = EnglishMinimalWordSmoother.from_words_file("1000words")
+        for sent in sents:
+            for tok in sent:
+                tok['oform']=tok['form']
+                tok['form'] = smoother.get(tok['form'])
+
+    return sents
+
+def test():
+    '''
+    Test Model
+    '''
+    logging.info("test ...")
+    featExt = extractors.get(FLAGS.feature_extractor)
+    p = ArcEagerParser(MLActionDecider(ml.MulticlassModel(FLAGS.model), featExt))
+
+    good = 0.0
+    bad = 0.0
+    complete = 0.0
+
+    # main test loop
+    reals = set()
+    preds = set()
+    # transform_conll_sents opens the test data itself; only the results file
+    # needs to be opened here.
+    with open(FLAGS.test_results, "w") as fout:
+        sents = transform_conll_sents(FLAGS.test_data)
+        for i,sent in enumerate(sents):
+            sgood=0.0
+            sbad=0.0
+            mistake=False
+            sys.stderr.write("%s %s %s\n"% ( "@@@",i,good/(good+bad+1)))
+            try:
+                d=p.parse(sent)
+            except MLTrainerWrongActionException:
+                # this happens only in "early update" parsers, and then we just go on to
+                # the next sentence..
+                continue
+            sent = d.annotate_allow_none(sent)
+            for tok in sent:
+                if FLAGS.ignore_punc and tok['form'][0] in "`',.-;:!?{}": continue
+                reals.add((i,tok['parent'],tok['id']))
+                preds.add((i,tok['pparent'],tok['id']))
+                if tok['pparent']==-1:continue
+                if tok['parent']==tok['pparent'] or tok['pparent']==-1:
+                    good+=1
+                    sgood+=1
+                else:
+                    bad+=1
+                    sbad+=1
+                    mistake=True
+            #print
+            if FLAGS.unlex:
+                io.out_conll(sent,parent='pparent',form='oform')
+            else:
+                io.out_conll(sent,parent='pparent',form='form')
+            if not mistake: complete+=1
+            #sys.exit()
+            logging.info("test result: sgood[%s], sbad[%s]", sgood, sbad)
+            if sgood + sbad > 0: # guard against empty sentences, but keep perfectly parsed ones
+                fout.write("%s\n" % (sgood/(sgood+sbad)))
+
+    logging.info("accuracy: %s", good/(good+bad))
+    logging.info("complete: %s", complete/len(sents))
+    preds = set([(i,p,c) for i,p,c in preds if p != -1])
+    logging.info("recall: %s", len(preds.intersection(reals))/float(len(reals)))
+    logging.info("precision: %s", len(preds.intersection(reals))/float(len(preds)))
+    logging.info("assigned: %s",len(preds)/float(len(reals)))
+
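+# Mode selection below: --externaltrainfile only writes feature vectors for an
+# external classifier (the file format is the same as Megam's, per the original
+# app/README); otherwise a Multitron model over the four arc-eager actions is
+# trained and saved to --model.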
+def train():
+    '''
+    Train Model
+    '''
+    # --model has a non-empty default, so --externaltrainfile must be checked
+    # first or the write mode could never be reached.
+    if FLAGS.externaltrainfile:
+        MODE='write'
+        TRAIN_OUT_FILE=FLAGS.externaltrainfile
+    elif FLAGS.model:
+        MODE='train'
+        TRAIN_OUT_FILE=FLAGS.model
+    else:
+        MODE='test'
+
+    featExt = extractors.get(FLAGS.feature_extractor)
+
+    sents = transform_conll_sents(FLAGS.train_data)
+
+    if MODE=="write":
+        fout = open(TRAIN_OUT_FILE,"w")
+        trainer = LoggingActionDecider(ArcEagerParsingOracle(pop_when_can=FLAGS.lazypop),featExt,fout)
+        p = ArcEagerParser( trainer)
+        for i,sent in enumerate(sents):
+            sys.stderr.write(". %s " % i)
+            sys.stderr.flush()
+            d=p.parse(sent)
+        sys.exit()
+
+    if MODE=="train":
+        fout = open(TRAIN_OUT_FILE, "w")
+        nactions = 4
+        trainer = MLTrainerActionDecider(ml.MultitronParameters(nactions), ArcEagerParsingOracle(pop_when_can=FLAGS.lazypop), featExt)
+        p = ArcEagerParser( trainer)
+        import random
+        random.seed("seed")
+        #random.shuffle(sents)
+        total = len(sents)
+        for x in range(FLAGS.epoch): # epoch
+            logging.info("iter %s/%s", x, FLAGS.epoch)
+            for i,sent in enumerate(sents):
+                if i % 500 == 0: logging.info("step %s/%s ...", i, total)
+                try:
+                    d=p.parse(sent)
+                except IndexError as e:
+                    logging.info("problem in sentence %s", i)
+                    logging.info("\n".join(["%s %s %s %s" % (t['id'],t['form'],t['tag'],t['parent']) for t in sent]))
+                    raise e
+        trainer.save(fout)
-if opts.SCORES_OUT:
-    scores_out.close()
-
-if opts.eval:
-    print "accuracy:", good/(good+bad)
-    print "complete:", complete/len(sents)
-    preds = set([(i,p,c) for i,p,c in preds if p != -1])
-    print "recall:", len(preds.intersection(reals))/float(len(reals))
-    print "precision:", len(preds.intersection(reals))/float(len(preds))
-    print "assigned:",len(preds)/float(len(reals))
-
+def main(argv):
+    print('Running under Python {0[0]}.{0[1]}.{0[2]}'.format(sys.version_info), file=sys.stderr)
+    if FLAGS.train: train()
+    if FLAGS.test: test()
+
+if __name__ == '__main__':
+    # FLAGS([__file__, '--verbosity', '1'])
+    app.run(main)
diff --git a/app/pio/io.py b/app/pio/io.py
index aed172d..d606bda 100644
--- a/app/pio/io.py
+++ b/app/pio/io.py
@@ -16,30 +16,50 @@
 """
 Author: Yoav Goldberg
 """
+from __future__ import print_function
+from __future__ import division
+
+import os
 import sys
+curdir = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(curdir)
+
+if sys.version_info[0] < 3:
+    reload(sys)
+    sys.setdefaultencoding("utf-8")
+    # raise "Must be using Python 3"
+
+from absl import app
+from absl import flags
+from absl import logging
+
 import yutils
 from collections import defaultdict
-
-sys.path.append("..")
 import common
 
 def to_tok(line):
-    if line[4]=="_": line[4]=line[3]
-    return {"parent": int(line[-4]),
-            "prel" : line[-3],
+    # if line[4]=="_": line[4]=line[3]
+    return {"parent": int(line[6]),
+            "prel" : line[7],
             "form" : line[1],
             "lem" : line[2],
             "id" : int(line[0]),
             "tag" : line[4],
             "ctag" : line[3],
-            "morph" : line[-5].split("|"),
-            "extra" : line[-1],
+            "morph" : line[5].split("|"),
+            "extra" : line[9],
             }
 
 def conll_to_sents(fh,ignore_errs=True):
     for sent in yutils.tokenize_blanks(fh):
         if ignore_errs and sent[0][0][0]=="@": continue
-        yield [to_tok(l) for l in sent]
+        lines = []
+        for x in sent:
+            # check the column count first so the HEAD/DEPREL tests below
+            # cannot raise IndexError on short rows
+            if len(x) != 10: continue
+            if x[0].strip().startswith("#"): continue
+            if x[6].strip() == "_" or x[7].strip() == "_": continue
+            lines.append(x)
+        if len(lines) > 0: yield [to_tok(l) for l in lines]
 
 def ann_conll_to_sents(fh):
     sent=[]
diff --git a/app/conll.example b/data/conll.example
similarity index 100%
rename from app/conll.example
rename to data/conll.example
diff --git a/tmp/.gitignore b/tmp/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/tmp/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore