From 606de016ab85786cf821457c13eead045aa9c2e8 Mon Sep 17 00:00:00 2001
From: Hai Liang Wang
Date: Fri, 16 Mar 2018 18:46:43 +0800
Subject: [PATCH] Enable train & test with yoavg's code

---
 .gitignore                  |   1 -
 README.md                   |  29 +++-
 admin/test.sh               |  18 +++
 admin/{dev.sh => train.sh}  |   6 +-
 app/README                  |  43 -----
 app/eager.py                | 305 +++++++++++++++++++++---------------
 app/pio/io.py               |  36 ++++-
 {app => data}/conll.example |   0
 tmp/.gitignore              |   2 +
 9 files changed, 261 insertions(+), 179 deletions(-)
 create mode 100755 admin/test.sh
 rename admin/{dev.sh => train.sh} (60%)
 delete mode 100644 app/README
 rename {app => data}/conll.example (100%)
 create mode 100644 tmp/.gitignore

diff --git a/.gitignore b/.gitignore
index d1ac3e2..da62e48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,6 @@
 *.pyc
 jmeter.log
 __pycache__
-tmp/
 node_modules/
 sftp-config.json
 .DS_Store
diff --git a/README.md b/README.md
index 596a552..3680cda 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,12 @@
 # text-dependency-parser: Dependency Parsing
 
+![](https://camo.githubusercontent.com/ae91a5698ad80d3fe8e0eb5a4c6ee7170e088a7d/687474703a2f2f37786b6571692e636f6d312e7a302e676c622e636c6f7564646e2e636f6d2f61692f53637265656e25323053686f74253230323031372d30342d30342532306174253230382e32302e3437253230504d2e706e67)
+
 ## Data format: [CoNLL-2009 Shared Task](http://ufal.mff.cuni.cz/conll2009-st/task-description.html)
 
-### universaldependencies
+### Universal Dependencies
 http://universaldependencies.org/
 
 ### Training set: 20,000 sentences from the Tsinghua University semantic dependency corpus
@@ -13,5 +15,26 @@ http://www.hankcs.com/nlp/corpus/chinese-treebank.html#h3-6
 
 ### Chinese Treebank
 http://www.hankcs.com/nlp/corpus/chinese-treebank.html
 
-### Transition-based dependency parsers
-https://www.cs.bgu.ac.il/~yoavg/software/transitionparser/
\ No newline at end of file
+## Run
+
+Train a model:
+
+```
+admin/train.sh
+```
+
+Test the trained model:
+
+```
+admin/test.sh
+```
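+
+Both scripts are thin wrappers around `app/eager.py`. Assuming the flags it
+defines, the equivalent direct call for training (this mirrors what
+`admin/train.sh` runs; paths are relative to `app/`) is:
+
+```
+cd app
+python eager.py \
+  --verbosity=1 \
+  --train=True \
+  --train_data=../data/UD_English-EWT/en-ud-train.conllu \
+  --model=../tmp/eager.model
+```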
+
+## Credits
+
+[Transition Based Dependency Parsers](https://www.cs.bgu.ac.il/~yoavg/software/transitionparser/)
+
+## References
+
+[1] Liang Huang, Wenbin Jiang and Qun Liu. 2009.
+    Bilingually-Constrained (Monolingual) Shift-Reduce Parsing.
\ No newline at end of file
diff --git a/admin/test.sh b/admin/test.sh
new file mode 100755
index 0000000..4c0c5ae
--- /dev/null
+++ b/admin/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+###########################################
+#
+###########################################
+
+# constants
+baseDir=$(cd `dirname "$0"`;pwd)
+# functions
+
+# main
+[ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return
+cd $baseDir/../app
+python eager.py \
+    --verbosity=1 \
+    --test=True \
+    --model=$baseDir/../tmp/eager.model \
+    --test_data=$baseDir/../data/UD_English-EWT/en-ud-test.conllu \
+    --test_results=$baseDir/../tmp/en-ud-test.results
\ No newline at end of file
diff --git a/admin/dev.sh b/admin/train.sh
similarity index 60%
rename from admin/dev.sh
rename to admin/train.sh
index 7460d99..59f294c 100755
--- a/admin/dev.sh
+++ b/admin/train.sh
@@ -10,4 +10,8 @@ baseDir=$(cd `dirname "$0"`;pwd)
 # main
 [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return
 cd $baseDir/../app
-python parser.py Test.test_UD_English_EWT
\ No newline at end of file
+python eager.py \
+    --verbosity=1 \
+    --train=True \
+    --train_data=$baseDir/../data/UD_English-EWT/en-ud-train.conllu \
+    --model=$baseDir/../tmp/eager.model
\ No newline at end of file
diff --git a/app/README b/app/README
deleted file mode 100644
index 5644bc2..0000000
--- a/app/README
+++ /dev/null
@@ -1,43 +0,0 @@
-Transition Based Dependency Parsers
-
-These are implementations of the (unlabeled) arc-eager and arc-standard dependency parsing algorithms.
-These parsers are very fast and are reasonably accurate.
-In particular, the arc-standard parser with the features described in [1] (the default feature set) can achieve very competitive accuracies.
-
-The input file for both training and parsing should be in CoNLL format (see conll.example).
-Columns 8,9,10 are always ignored (but must be present).
-When parsing new text, you can put whatever you want in column 7, the parser will overwrite it (it uses this column to report accuracy scores)
-
-Compiling:
-==========
-Speed is achieved using a c/cython extension module.
-This needs to be compiled using either cython or a c compiler.
-See instructions in ml/README
-
-Training the parsers:
-=====================
-
-   ./eager.py -o model_file [options] conll_input_file
-
-   or
-
-   ./standard.py -o model_file [options] conll_input_file
-
-   (use -f instead of -o to create feature vector files for training with an external classifier. If you don't know what it means,
-    just ignore this option. The model file format is the same as Megam's.)
-
-Parsing new text with the trained model:
-========================================
-
-   ./eager.py -m model_file [options] conll_file_to_parse > output
-
-   or
-
-   ./standard.py -m model_file [options] conll_file_to_parse > output
-
-
-References:
-~~~~~~~~~~~
-[1] Liang Huang, Wenbin Jiang and Qun Liu. 2009.
-    Bilingually-Constrained (Monolingual) Shift-Reduce Parsing. 
- diff --git a/app/eager.py b/app/eager.py index 21a245c..6f2a314 100755 --- a/app/eager.py +++ b/app/eager.py @@ -18,134 +18,193 @@ Author: Yoav Goldberg (yoav.goldberg@gmail.com) """ -from features import extractors -from params import parser +from __future__ import print_function +from __future__ import division -opts, args = parser.parse_args() - -if opts.trainfile: - MODE='train' - TRAIN_OUT_FILE=opts.trainfile -elif opts.externaltrainfile: - MODE='write' - TRAIN_OUT_FILE=opts.externaltrainfile -else: - MODE='test' - -if opts.SCORES_OUT: - scores_out = file("eager.scores","w") +import os +import sys +curdir = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(curdir) -DATA_FILE=args[0] +if sys.version_info[0] < 3: + reload(sys) + sys.setdefaultencoding("utf-8") + # raise "Must be using Python 3" -######## +from absl import app +from absl import flags +from absl import logging -import sys from ml import ml - from pio import io from transitionparser import * - -featExt = extractors.get(opts.feature_extarctor) - -sents = list(io.conll_to_sents(file(DATA_FILE))) - -if opts.only_proj: - import isprojective - sents = [s for s in sents if isprojective.is_projective(s)] - -if opts.UNLEX: - from shared.lemmatize import EnglishMinimalWordSmoother - smoother = EnglishMinimalWordSmoother.from_words_file("1000words") - for sent in sents: - for tok in sent: - tok['oform']=tok['form'] - tok['form'] = smoother.get(tok['form']) - -if MODE=="write": - fout = file(TRAIN_OUT_FILE,"w") - trainer = LoggingActionDecider(ArcEagerParsingOracle(pop_when_can=opts.POP_WHEN_CAN),featExt,fout) - p = ArcEagerParser( trainer) - for i,sent in enumerate(sents): - sys.stderr.write(". %s " % i) - sys.stderr.flush() - d=p.parse(sent) - sys.exit() - - -if MODE=="train": - fout = file(TRAIN_OUT_FILE,"w") - nactions = 4 - trainer = MLTrainerActionDecider(ml.MultitronParameters(nactions), ArcEagerParsingOracle(pop_when_can=opts.POP_WHEN_CAN), featExt) - p = ArcEagerParser( trainer) - import random - random.seed("seed") - #random.shuffle(sents) - for x in xrange(10): - print "iter ",x - for i,sent in enumerate(sents): - if i % 500 == 0: print i, - try: - d=p.parse(sent) - except IndexError,e: - print "prob in sent:",i - print "\n".join(["%s %s %s %s" % (t['id'],t['form'],t['tag'],t['parent']) for t in sent]) - raise e - trainer.save(fout) - sys.exit() -# test -elif MODE=="test": - p = ArcEagerParser(MLActionDecider(ml.MulticlassModel(opts.modelfile),featExt)) - -good = 0.0 -bad = 0.0 -complete=0.0 - -#main test loop -reals = set() -preds = set() - -for i,sent in enumerate(sents): - sgood=0.0 - sbad=0.0 - mistake=False - sys.stderr.write("%s %s %s\n"% ( "@@@",i,good/(good+bad+1))) - try: - d=p.parse(sent) - except MLTrainerWrongActionException: - # this happens only in "early update" parsers, and then we just go on to - # the next sentence.. 
-        continue
-    sent = d.annotate_allow_none(sent)
-    for tok in sent:
-        if opts.ignore_punc and tok['form'][0] in "`',.-;:!?{}": continue
-        reals.add((i,tok['parent'],tok['id']))
-        preds.add((i,tok['pparent'],tok['id']))
-        if tok['pparent']==-1:continue
-        if tok['parent']==tok['pparent'] or tok['pparent']==-1:
-            good+=1
-            sgood+=1
-        else:
-            bad+=1
-            sbad+=1
-            mistake=True
-    #print
-    if opts.UNLEX:
-        io.out_conll(sent,parent='pparent',form='oform')
-    else:
-        io.out_conll(sent,parent='pparent',form='form')
-    if not mistake: complete+=1
-    #sys.exit()
-    if opts.SCORES_OUT:
-        scores_out.write("%s\n" % (sgood/(sgood+sbad)))
+from features import extractors
+
+FLAGS = flags.FLAGS
+'''
+General
+'''
+
+flags.DEFINE_boolean('ignore_punc', False, 'Ignore punctuation when scoring.')
+flags.DEFINE_boolean('only_projective', False, 'Keep only projective sentences.')
+flags.DEFINE_boolean('lazypop', True, 'Passed to the arc-eager oracle as pop_when_can.')
+flags.DEFINE_boolean('unlex', False, 'Unlexicalize rare word forms (uses the 1000words list).')
+flags.DEFINE_string('feature_extractor', 'eager.zhang', 'Feature extractor.')
+flags.DEFINE_string('model', os.path.join(curdir, os.path.pardir, "tmp", "eager.model"), 'Transition parser model path.')
+
+'''
+Train
+'''
+flags.DEFINE_boolean('train', False, 'Train a model on the training data.')
+flags.DEFINE_integer('epoch', 1, 'Number of training epochs.')
+flags.DEFINE_string('train_data', os.path.join(curdir, os.path.pardir, "data", "conll.example"), 'Training data (CoNLL format).')
+
+flags.DEFINE_string('externaltrainfile', None, 'Write feature vectors for an external classifier to this file.')
+# flags.DEFINE_string('modelfile', 'data/weights', 'Model File.')
+
+'''
+Test
+'''
+flags.DEFINE_boolean('test', False, 'Evaluate with the test data.')
+flags.DEFINE_string('test_data', os.path.join(curdir, os.path.pardir, "data", "conll.example"), 'Test data (CoNLL format).')
+flags.DEFINE_string('test_results', os.path.join(curdir, os.path.pardir, "tmp", "eager.test.results"), 'File to write per-sentence scores to.')
+
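+# Input format note: train and test data are CoNLL-style files, one token per
+# line with 10 tab-separated columns, as read by pio/io.py:
+#   0:ID 1:FORM 2:LEMMA 3:CPOSTAG 4:POSTAG 5:FEATS 6:HEAD 7:DEPREL 8:PHEAD/DEPS 9:MISC
+# The reader skips comment lines and rows whose HEAD or DEPREL is "_"
+# (e.g. multiword-token ranges in CoNLL-U files).
+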
+def transform_conll_sents(conll_file_path):
+    '''
+    Read a CoNLL file and return its sentences, optionally filtered and unlexicalized.
+    '''
+    sents = list(io.conll_to_sents(open(conll_file_path)))
+
+    if FLAGS.only_projective:
+        import isprojective
+        sents = [s for s in sents if isprojective.is_projective(s)]
+
+    if FLAGS.unlex:
+        from shared.lemmatize import EnglishMinimalWordSmoother
+        smoother = EnglishMinimalWordSmoother.from_words_file("1000words")
+        for sent in sents:
+            for tok in sent:
+                tok['oform']=tok['form']
+                tok['form'] = smoother.get(tok['form'])
+
+    return sents
+
+def test():
+    '''
+    Test Model
+    '''
+    logging.info("test ...")
+    featExt = extractors.get(FLAGS.feature_extractor)
+    p = ArcEagerParser(MLActionDecider(ml.MulticlassModel(FLAGS.model), featExt))
+
+    good = 0.0
+    bad = 0.0
+    complete = 0.0
+
+    # main test loop
+    reals = set()
+    preds = set()
+    # transform_conll_sents opens the test data itself; only the results file
+    # needs to be opened here.
+    with open(FLAGS.test_results, "w") as fout:
+        sents = transform_conll_sents(FLAGS.test_data)
+        for i,sent in enumerate(sents):
+            sgood=0.0
+            sbad=0.0
+            mistake=False
+            sys.stderr.write("%s %s %s\n"% ( "@@@",i,good/(good+bad+1)))
+            try:
+                d=p.parse(sent)
+            except MLTrainerWrongActionException:
+                # this happens only in "early update" parsers, and then we just go on to
+                # the next sentence..
+                continue
+            sent = d.annotate_allow_none(sent)
+            for tok in sent:
+                if FLAGS.ignore_punc and tok['form'][0] in "`',.-;:!?{}": continue
+                reals.add((i,tok['parent'],tok['id']))
+                preds.add((i,tok['pparent'],tok['id']))
+                if tok['pparent']==-1:continue
+                if tok['parent']==tok['pparent'] or tok['pparent']==-1:
+                    good+=1
+                    sgood+=1
+                else:
+                    bad+=1
+                    sbad+=1
+                    mistake=True
+            #print
+            if FLAGS.unlex:
+                io.out_conll(sent,parent='pparent',form='oform')
+            else:
+                io.out_conll(sent,parent='pparent',form='form')
+            if not mistake: complete+=1
+            #sys.exit()
+            logging.info("test result: sgood[%s], sbad[%s]", sgood, sbad)
+            if sgood + sbad > 0: # guard against empty sentences, but keep perfectly parsed ones
+                fout.write("%s\n" % (sgood/(sgood+sbad)))
+
+    logging.info("accuracy: %s", good/(good+bad))
+    logging.info("complete: %s", complete/len(sents))
+    preds = set([(i,p,c) for i,p,c in preds if p != -1])
+    logging.info("recall: %s", len(preds.intersection(reals))/float(len(reals)))
+    logging.info("precision: %s", len(preds.intersection(reals))/float(len(preds)))
+    logging.info("assigned: %s",len(preds)/float(len(reals)))
+
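+# Mode selection below: --externaltrainfile only writes feature vectors for an
+# external classifier (the file format is the same as Megam's, per the original
+# app/README); otherwise a Multitron model over the four arc-eager actions is
+# trained and saved to --model.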
+def train():
+    '''
+    Train Model
+    '''
+    # --model has a non-empty default, so --externaltrainfile must be checked
+    # first or the write mode could never be reached.
+    if FLAGS.externaltrainfile:
+        MODE='write'
+        TRAIN_OUT_FILE=FLAGS.externaltrainfile
+    elif FLAGS.model:
+        MODE='train'
+        TRAIN_OUT_FILE=FLAGS.model
+    else:
+        MODE='test'
+
+    featExt = extractors.get(FLAGS.feature_extractor)
+
+    sents = transform_conll_sents(FLAGS.train_data)
+
+    if MODE=="write":
+        fout = open(TRAIN_OUT_FILE,"w")
+        trainer = LoggingActionDecider(ArcEagerParsingOracle(pop_when_can=FLAGS.lazypop),featExt,fout)
+        p = ArcEagerParser( trainer)
+        for i,sent in enumerate(sents):
+            sys.stderr.write(". %s " % i)
+            sys.stderr.flush()
+            d=p.parse(sent)
+        sys.exit()
+
+    if MODE=="train":
+        fout = open(TRAIN_OUT_FILE, "w")
+        nactions = 4
+        trainer = MLTrainerActionDecider(ml.MultitronParameters(nactions), ArcEagerParsingOracle(pop_when_can=FLAGS.lazypop), featExt)
+        p = ArcEagerParser( trainer)
+        import random
+        random.seed("seed")
+        #random.shuffle(sents)
+        total = len(sents)
+        for x in range(FLAGS.epoch): # epoch
+            logging.info("iter %s/%s", x, FLAGS.epoch)
+            for i,sent in enumerate(sents):
+                if i % 500 == 0: logging.info("step %s/%s ...", i, total)
+                try:
+                    d=p.parse(sent)
+                except IndexError as e:
+                    logging.info("problem in sentence %s", i)
+                    logging.info("\n".join(["%s %s %s %s" % (t['id'],t['form'],t['tag'],t['parent']) for t in sent]))
+                    raise e
+        trainer.save(fout)
-if opts.SCORES_OUT:
-    scores_out.close()
-
-if opts.eval:
-    print "accuracy:", good/(good+bad)
-    print "complete:", complete/len(sents)
-    preds = set([(i,p,c) for i,p,c in preds if p != -1])
-    print "recall:", len(preds.intersection(reals))/float(len(reals))
-    print "precision:", len(preds.intersection(reals))/float(len(preds))
-    print "assigned:",len(preds)/float(len(reals))
-
+def main(argv):
+    print('Running under Python {0[0]}.{0[1]}.{0[2]}'.format(sys.version_info), file=sys.stderr)
+    if FLAGS.train: train()
+    if FLAGS.test: test()
+
+if __name__ == '__main__':
+    # FLAGS([__file__, '--verbosity', '1'])
+    app.run(main)
diff --git a/app/pio/io.py b/app/pio/io.py
index aed172d..d606bda 100644
--- a/app/pio/io.py
+++ b/app/pio/io.py
@@ -16,30 +16,50 @@
 """
 Author: Yoav Goldberg
 """
+from __future__ import print_function
+from __future__ import division
+
+import os
 import sys
+curdir = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(curdir)
+
+if sys.version_info[0] < 3:
+    reload(sys)
+    sys.setdefaultencoding("utf-8")
+    # raise "Must be using Python 3"
+
+from absl import app
+from absl import flags
+from absl import logging
+
 import yutils
 from collections import defaultdict
-
-sys.path.append("..")
 import common
 
 def to_tok(line):
-    if line[4]=="_": line[4]=line[3]
-    return {"parent": int(line[-4]),
-            "prel" : line[-3],
+    # if line[4]=="_": line[4]=line[3]
+    return {"parent": int(line[6]),
+            "prel" : line[7],
             "form" : line[1],
             "lem" : line[2],
             "id" : int(line[0]),
             "tag" : line[4],
             "ctag" : line[3],
-            "morph" : line[-5].split("|"),
-            "extra" : line[-1],
+            "morph" : line[5].split("|"),
+            "extra" : line[9],
             }
 
 def conll_to_sents(fh,ignore_errs=True):
     for sent in yutils.tokenize_blanks(fh):
         if ignore_errs and sent[0][0][0]=="@": continue
-        yield [to_tok(l) for l in sent]
+        lines = []
+        for x in sent:
+            # check the column count first so the HEAD/DEPREL tests below
+            # cannot raise IndexError on short rows
+            if len(x) != 10: continue
+            if x[0].strip().startswith("#"): continue
+            if x[6].strip() == "_" or x[7].strip() == "_": continue
+            lines.append(x)
+        if len(lines) > 0: yield [to_tok(l) for l in lines]
 
 def ann_conll_to_sents(fh):
     sent=[]
diff --git a/app/conll.example b/data/conll.example
similarity index 100%
rename from app/conll.example
rename to data/conll.example
diff --git a/tmp/.gitignore b/tmp/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/tmp/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore