diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..5f32f73 Binary files /dev/null and b/.DS_Store differ diff --git a/amr_parsing.py b/amr_parsing.py index a570738..dafa8e2 100755 --- a/amr_parsing.py +++ b/amr_parsing.py @@ -132,13 +132,16 @@ def main(): arg_parser.add_argument('-s','--start_step',type=int,default=0,help='specify which step to begin oracle testing;for debug') #arg_parser.add_argument('-i','--input_file',help='the input: preprocessed data instances file for aligner or training') arg_parser.add_argument('-d','--dev',help='development file') + arg_parser.add_argument('-a','--add',help='additional training file') arg_parser.add_argument('-as','--actionset',choices=['basic'],default='basic',help='choose different action set') arg_parser.add_argument('-m','--mode',choices=['preprocess','test_gold_graph','align','userGuide','oracleGuide','train','parse','eval'],help="preprocess:generate pos tag, dependency tree, ner\n" "align:do alignment between AMR graph and sentence string") arg_parser.add_argument('-dp','--depparser',choices=['stanford','stanfordConvert','stdconv+charniak','clear','mate','turbo'],default='stdconv+charniak',help='choose the dependency parser') arg_parser.add_argument('--coref',action='store_true',help='flag to enable coreference information') arg_parser.add_argument('--prop',action='store_true',help='flag to enable semantic role labeling information') arg_parser.add_argument('--rne',action='store_true',help='flag to enable rich name entity') - arg_parser.add_argument('--onto',action='store_true',help='flag to enable charniak parse result trained on ontonotes') + arg_parser.add_argument('--verblist',action='store_true',help='flag to enable verbalization list') + #arg_parser.add_argument('--onto',action='store_true',help='flag to enable charniak parse result trained on ontonotes') + arg_parser.add_argument('--onto',choices=['onto','onto+bolt','wsj'],default='wsj',help='choose which charniak parse result trained on ontonotes') arg_parser.add_argument('--model',help='specify the model file') arg_parser.add_argument('--feat',help='feature template file') arg_parser.add_argument('-iter','--iterations',default=1,type=int,help='training iterations') @@ -156,6 +159,7 @@ def main(): constants.FLAG_COREF=args.coref constants.FLAG_PROP=args.prop constants.FLAG_RNE=args.rne + constants.FLAG_VERB=args.verblist constants.FLAG_ONTO=args.onto constants.FLAG_DEPPARSER=args.depparser @@ -308,9 +312,11 @@ def main(): print "Incorporate Coref Information: %s"%(constants.FLAG_COREF) print "Incorporate SRL Information: %s"%(constants.FLAG_PROP) print "Substitue the normal name entity tag with rich name entity tag: %s"%(constants.FLAG_RNE) + print "Using verbalization list: %s"%(constants.FLAG_VERB) print "Using charniak parser trained on ontonotes: %s"%(constants.FLAG_ONTO) print "Dependency parser used: %s"%(constants.FLAG_DEPPARSER) - train_instances = preprocess(amr_file,START_SNLP=False) + train_instances = preprocess(amr_file,START_SNLP=False) + if args.add: train_instances = train_instances + preprocess(args.add,START_SNLP=False) if args.dev: dev_instances = preprocess(args.dev,START_SNLP=False) @@ -330,6 +336,11 @@ def main(): model.setup(action_type=args.actionset,instances=train_instances,parser=parser,feature_templates_file=feat_template) print >> experiment_log, "BEGIN TRAINING!" + best_fscore = 0.0 + best_pscore = 0.0 + best_rscore = 0.0 + best_model = None + best_iter = 1 for iter in xrange(1,args.iterations+1): print >> experiment_log, "shuffling training instances" random.shuffle(train_instances) @@ -338,22 +349,36 @@ def main(): begin_updates = parser.perceptron.get_num_updates() parser.parse_corpus_train(train_instances) parser.perceptron.average_weight() - #model.save_model(args.model+'-iter'+str(iter)+'-'+str(int(time.time()))+'.m') - model.save_model(args.model+'-iter'+str(iter)+'.m') + if args.dev: print >> experiment_log ,"Result on develop set:" _,parsed_amr = parser.parse_corpus_test(dev_instances) - write_parsed_amr(parsed_amr,dev_instances,args.dev,args.section+'.'+str(iter)+'.parsed') + parsed_suffix = args.section+'.'+args.model.split('.')[-1]+'.'+str(iter)+'.parsed' + write_parsed_amr(parsed_amr,dev_instances,args.dev,parsed_suffix) if args.smatcheval: smatch_path = "./smatch_2.0.2/smatch.py" python_path = 'python' options = '--pr -f' - parsed_filename = args.dev+'.'+args.section+'.'+str(iter)+'.parsed' + parsed_filename = args.dev+'.'+parsed_suffix command = '%s %s %s %s %s' % (python_path, smatch_path, options, parsed_filename, args.dev) print 'Evaluation using command: ' + (command) - print subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True) - + #print subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True) + eval_output = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True) + print eval_output + pscore = float(eval_output.split('\n')[0].split(':')[1].rstrip()) + rscore = float(eval_output.split('\n')[1].split(':')[1].rstrip()) + fscore = float(eval_output.split('\n')[2].split(':')[1].rstrip()) + if fscore > best_fscore: + best_model = model + best_iter = iter + best_fscore = fscore + best_pscore = pscore + best_rscore = rscore + + if best_model is not None: + print >> experiment_log, "Best result on iteration %d:\n Precision: %f\n Recall: %f\n F-score: %f" % (best_iter, best_pscore, best_rscore, best_fscore) + best_model.save_model(args.model+'.m') print >> experiment_log ,"DONE TRAINING!" elif args.mode == 'parse': # actual parsing @@ -369,7 +394,8 @@ def main(): parser = Parser(model=model,oracle_type=DET_T2G_ORACLE_ABT,action_type=args.actionset,verbose=args.verbose,elog=experiment_log) print >> experiment_log ,"BEGIN PARSING" span_graph_pairs,results = parser.parse_corpus_test(test_instances) - write_parsed_amr(results,test_instances,amr_file,suffix='%s.parsed'%(args.section)) + parsed_suffix = '%s.%s.parsed'%(args.section,args.model.split('.')[-2]) + write_parsed_amr(results,test_instances,amr_file,suffix=parsed_suffix) #write_span_graph(span_graph_pairs,test_instances,amr_file,suffix='spg.50') ################ # for eval # @@ -381,7 +407,7 @@ def main(): smatch_path = "./smatch_2.0.2/smatch.py" python_path = 'python' options = '--pr -f' - parsed_filename = amr_file+'.'+args.section+'.parsed' + parsed_filename = amr_file+'.'+parsed_suffix command = '%s %s %s %s %s' % (python_path,smatch_path,options,parsed_filename, amr_file) print 'Evaluation using command: ' + (command) diff --git a/constants.py b/constants.py index d27e896..904a3fb 100644 --- a/constants.py +++ b/constants.py @@ -10,8 +10,9 @@ FLAG_COREF=False FLAG_PROP=False FLAG_RNE=False +FLAG_VERB=False FLAG_DEPPARSER='stanford' -FLAG_ONTO=False +FLAG_ONTO='wsj' # constants NOT_APPLY='_NOT_APPLY_' diff --git a/preprocessing.py b/preprocessing.py index f6e143c..f01e4ae 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -52,7 +52,7 @@ def readAMREval(eval_file_path): ''' read in semeval evaluation format (without amr) ''' - eval_file = codecs.open(eval_file_path,'r',encoding='macroman') + eval_file = codecs.open(eval_file_path,'r',encoding='utf-8') comment_list = [] comment = OrderedDict() #amr_list = [] @@ -145,7 +145,11 @@ def _load_cparse(cparse_filename): def _fix_prop_head(inst,ctree,start_index,height): head_index = None - tree_pos = ctree.leaf_treeposition(start_index) + try: + tree_pos = ctree.leaf_treeposition(start_index) + except IndexError: + import pdb + pdb.set_trace() span_root = ctree[tree_pos[:-(height+1)]] end_index = start_index + len(span_root.leaves()) cur = inst.tokens[start_index+1] @@ -330,7 +334,7 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'): if 'alignments' in comments[i]: alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments']) # use verbalization list to fix the unaligned tokens - Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment) + if constants.FLAG_VERB: Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment) #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens) ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens) #ggraph.pre_merge_netag(instances[i]) @@ -419,7 +423,12 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'): _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER) elif constants.FLAG_DEPPARSER == "stdconv+charniak": - dep_filename = tok_sent_filename+'.charniak.onto.parse.dep' if constants.FLAG_ONTO else tok_sent_filename+'.charniak.parse.dep' + if constants.FLAG_ONTO == 'onto': + dep_filename = tok_sent_filename+'.charniak.onto.parse.dep' + elif constants.FLAG_ONTO == 'onto+bolt': + dep_filename = tok_sent_filename+'.charniak.onto+bolt.parse.dep' + else: + dep_filename = tok_sent_filename+'.charniak.parse.dep' if not os.path.exists(dep_filename): dparser = CharniakParser() dparser.parse(tok_sent_filename) @@ -463,7 +472,7 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'): if constants.FLAG_PROP: print >> log, "Adding SRL information..." - prop_filename = tok_sent_filename + '.prop' + prop_filename = tok_sent_filename + '.prop' if constants.FLAG_ONTO != 'onto+bolt' else tok_sent_filename + '.onto+bolt.prop' if os.path.exists(prop_filename): if constants.FLAG_DEPPARSER == "stdconv+charniak": _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True) diff --git a/scripts/cmd.test.brown-verb-onto-rne.sh b/scripts/cmd.test.brown-verb-onto-rne.sh new file mode 100644 index 0000000..d9375c8 --- /dev/null +++ b/scripts/cmd.test.brown-verb-onto-rne.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Testing ..." +/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --onto onto --rne --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne.m ./data/semeval/test.txt > ./log/amr-semeval-all.test.test.basic-abt-brown-verb-onto-rne.log 2>&1 & diff --git a/scripts/cmd.test.brown-verb-onto.sh b/scripts/cmd.test.brown-verb-onto.sh new file mode 100644 index 0000000..69beb43 --- /dev/null +++ b/scripts/cmd.test.brown-verb-onto.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Testing ..." +/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --onto onto --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto.m ./data/semeval/test.txt > ./log/amr-semeval-all.test.test.basic-abt-brown-verb-onto.log 2>&1 & diff --git a/scripts/cmd.test.brown-verb-rne.sh b/scripts/cmd.test.brown-verb-rne.sh new file mode 100644 index 0000000..e44a4e8 --- /dev/null +++ b/scripts/cmd.test.brown-verb-rne.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Testing ..." +/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --rne --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-rne.m ./data/semeval/test.txt > ./log/amr-semeval-all.test.test.basic-abt-brown-verb-rne.log 2>&1 & diff --git a/scripts/cmd.test.brown-verb.sh b/scripts/cmd.test.brown-verb.sh new file mode 100644 index 0000000..399631e --- /dev/null +++ b/scripts/cmd.test.brown-verb.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Testing ..." +/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb.m ./data/semeval/test.txt > ./log/amr-semeval-all.test.test.basic-abt-brown-verb.log 2>&1 & diff --git a/scripts/cmd.test.sh b/scripts/cmd.test.sh new file mode 100644 index 0000000..2d4f930 --- /dev/null +++ b/scripts/cmd.test.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Testing ..." +/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt.m ./data/semeval/test.txt > ./log/amr-semeval-all.test.basic-abt.log 2>&1 & diff --git a/scripts/cmd.test.verb.sh b/scripts/cmd.test.verb.sh new file mode 100644 index 0000000..c9ac6e0 --- /dev/null +++ b/scripts/cmd.test.verb.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Testing ..." +/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-verb.m ./data/semeval/test.txt > ./log/amr-semeval-all.test.basic-abt-verb.log 2>&1 & diff --git a/scripts/cmd.train.brown-verb-onto+bolt-rne-srl.sh b/scripts/cmd.train.brown-verb-onto+bolt-rne-srl.sh new file mode 100644 index 0000000..29c3be7 --- /dev/null +++ b/scripts/cmd.train.brown-verb-onto+bolt-rne-srl.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --prop --rne --onto 'onto+bolt' --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto+bolt-rne-srl -iter 5 --feat ./feature/basic_abt_srl_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto+bolt-rne-srl.log 2>&1 & diff --git a/scripts/cmd.train.brown-verb-onto-rne-srl.sh b/scripts/cmd.train.brown-verb-onto-rne-srl.sh index 6616d33..595e7d2 100644 --- a/scripts/cmd.train.brown-verb-onto-rne-srl.sh +++ b/scripts/cmd.train.brown-verb-onto-rne-srl.sh @@ -1,4 +1,4 @@ #!/bin/sh echo "Training Model ..." -/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --prop --rne --onto --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne-srl -iter 5 --feat ./feature/basic_abt_srl_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/test.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-rne-srl.log 2>&1 & +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --prop --rne --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne-srl -iter 5 --feat ./feature/basic_abt_srl_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-rne-srl.log 2>&1 & diff --git a/scripts/cmd.train.brown-verb-onto-rne.sh b/scripts/cmd.train.brown-verb-onto-rne.sh index 25fab4c..504f757 100644 --- a/scripts/cmd.train.brown-verb-onto-rne.sh +++ b/scripts/cmd.train.brown-verb-onto-rne.sh @@ -1,4 +1,4 @@ #!/bin/sh echo "Training Model ..." -/usr/bin/python amr_parsing.py -m train --amrfmt amr --rne --onto --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne -iter 5 --feat ./feature/basic_abt_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-rne.log 2>&1 & +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --rne --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne -iter 5 --feat ./feature/basic_abt_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-rne.log 2>&1 & diff --git a/scripts/cmd.train.brown-verb-onto-srl.sh b/scripts/cmd.train.brown-verb-onto-srl.sh new file mode 100644 index 0000000..10d7c03 --- /dev/null +++ b/scripts/cmd.train.brown-verb-onto-srl.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --prop --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-srl -iter 5 --feat ./feature/basic_abt_srl_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-srl.log 2>&1 & diff --git a/scripts/cmd.train.brown-verb-onto.sh b/scripts/cmd.train.brown-verb-onto.sh new file mode 100644 index 0000000..b28e9cd --- /dev/null +++ b/scripts/cmd.train.brown-verb-onto.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto -iter 5 --feat ./feature/basic_abt_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto.log 2>&1 & diff --git a/scripts/cmd.train.brown-verb-rne.sh b/scripts/cmd.train.brown-verb-rne.sh new file mode 100755 index 0000000..6d5e968 --- /dev/null +++ b/scripts/cmd.train.brown-verb-rne.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --verblist --smatcheval --rne --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-rne -iter 5 --feat ./feature/basic_abt_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-rne.log 2>&1 & diff --git a/scripts/cmd.train.brown-verb.sh b/scripts/cmd.train.brown-verb.sh new file mode 100755 index 0000000..6967dba --- /dev/null +++ b/scripts/cmd.train.brown-verb.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python amr_parsing.py -m train --amrfmt amr --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb -iter 5 --feat ./feature/basic_abt_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb.log 2>&1 & diff --git a/scripts/cmd.train.onto-verb.sh b/scripts/cmd.train.onto-verb.sh new file mode 100644 index 0000000..e5269c8 --- /dev/null +++ b/scripts/cmd.train.onto-verb.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-onto-verb -iter 5 --feat ./feature/basic_abt_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-onto-verb.log 2>&1 & diff --git a/scripts/cmd.train.onto.sh b/scripts/cmd.train.onto.sh new file mode 100644 index 0000000..a706272 --- /dev/null +++ b/scripts/cmd.train.onto.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-onto -iter 5 --feat ./feature/basic_abt_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-onto.log 2>&1 & diff --git a/scripts/cmd.train.sh b/scripts/cmd.train.sh new file mode 100755 index 0000000..38b94ae --- /dev/null +++ b/scripts/cmd.train.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python amr_parsing.py -m train --amrfmt amr --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt -iter 5 --feat ./feature/basic_abt_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt.log 2>&1 & diff --git a/scripts/cmd.train.verb.sh b/scripts/cmd.train.verb.sh new file mode 100755 index 0000000..82d90a0 --- /dev/null +++ b/scripts/cmd.train.verb.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "Training Model ..." +/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-verb -iter 5 --feat ./feature/basic_abt_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-verb.log 2>&1 & diff --git a/scripts/fix_multi_sent.py b/scripts/fix_multi_sent.py new file mode 100644 index 0000000..6f41ee7 --- /dev/null +++ b/scripts/fix_multi_sent.py @@ -0,0 +1,34 @@ + +''' +fix charniak parse file with multiple sentences +''' +from nltk.tree import Tree +import sys +import re +import codecs + +def fix_multi_sent(line): + tree = Tree.fromstring(line) + if len(tree) > 1: + newtree = Tree('S1',[Tree('S',tree[:])]) + else: + newtree = tree + return re.sub('\n\s*',' ',newtree.__str__()) + + +if __name__ == '__main__': + old_parse_file = sys.argv[1] + new_parse_file = old_parse_file.rsplit('.',1)[0] + print '%s >> %s' % (old_parse_file, new_parse_file) + result = [] + with codecs.open(old_parse_file,'r',encoding='utf-8') as f: + for line in f: + line = line.strip() + new = fix_multi_sent(line) + result.append(new) + + with codecs.open(new_parse_file, 'w', encoding='utf-8') as wf: + wf.write('\n'.join(result)) + wf.write('\n') + + diff --git a/temp/feats_gen_basic_abt_srl_feats.py b/temp/feats_gen_basic_abt_srl_feats.py new file mode 100644 index 0000000..6bd055f --- /dev/null +++ b/temp/feats_gen_basic_abt_srl_feats.py @@ -0,0 +1,122 @@ +#generated by model.py +from constants import * +def generate_features(state,action): + s0,b0,a0=state.get_feature_context_window(action) + feats=[] + act_idx = state.model.class_codebook.get_index(action['type']) + tx = action['tag'] if 'tag' in action else EMPTY + s0_ne=s0['ne'] if s0 else EMPTY + s0_w=s0['form'] if s0 else EMPTY + s0_lemma=s0['lemma'] if s0 else EMPTY + s0_t=s0['pos'] if s0 else EMPTY + s0_dl=s0['rel'] if s0 else EMPTY + s0_len=s0['len'] if s0 else EMPTY + s0_txv=s0['txv'] if s0 else EMPTY + s0_frmset=s0['frmset'] if s0 else EMPTY + s0_eqfrmset=s0['eqfrmset'] if s0 else EMPTY + b0_isarg=b0['isarg'] if b0 else EMPTY + b0_arglabel=b0['arglabel'] if b0 else EMPTY + b0_dl=b0['rel'] if b0 else EMPTY + a0_isprd=a0['isprd'] if a0 else EMPTY + a0_prdlabel=a0['prdlabel'] if a0 else EMPTY + b0_isprd=b0['isprd'] if b0 else EMPTY + b0_prdlabel=b0['prdlabel'] if b0 else EMPTY + a0_isarg=a0['isarg'] if a0 else EMPTY + a0_arglabel=a0['arglabel'] if a0 else EMPTY + s0_isnom=s0['isnom'] if s0 else EMPTY + s0_nech=s0['nech'] if s0 else EMPTY + s0_c1lemma=s0['c1lemma'] if s0 else EMPTY + s0_c1dl=s0['c1dl'] if s0 else EMPTY + s0_cpt=s0['concept'] if s0 else EMPTY + s0_p1_ne=s0['p1']['ne'] if s0 and s0['p1'] else EMPTY + b0_ne=b0['ne'] if b0 else EMPTY + b0_w=b0['form'] if b0 else EMPTY + b0_lemma=b0['lemma'] if b0 else EMPTY + b0_t=b0['pos'] if b0 else EMPTY + b0_len=b0['len'] if b0 else EMPTY + b0_reph=b0['reph'] if b0 else EMPTY + a0_ne=a0['ne'] if a0 else EMPTY + a0_w=a0['form'] if a0 else EMPTY + a0_lemma=a0['lemma'] if a0 else EMPTY + a0_t=a0['pos'] if a0 else EMPTY + a0_dl=a0['rel'] if a0 else EMPTY + s0_p1_w=s0['p1']['form'] if s0 and s0['p1'] else EMPTY + s0_p1_lemma=s0['p1']['lemma'] if s0 and s0['p1'] else EMPTY + s0_p1_t=s0['p1']['pos'] if s0 and s0['p1'] else EMPTY + s0_p1_dl=s0['p1']['rel'] if s0 and s0['p1'] else EMPTY + b0_pathpwd=b0['pathpwd'] if b0 else EMPTY + b0_apathpwd=b0['apathpwd'] if b0 else EMPTY + b0_rsb_dl=b0['rsb']['rel'] if b0 and b0['rsb'] else EMPTY + b0_nswp=b0['nswp'] if b0 else EMPTY + b0_pathp=b0['pathp'] if b0 else EMPTY + b0_apathp=b0['apathp'] if b0 else EMPTY + a0_cpt=a0['concept'] if a0 else EMPTY + b0_cpt=b0['concept'] if b0 else EMPTY + dist1=abs(s0['id']-b0['id']) if b0 and b0 is not ABT_TOKEN and s0 is not ABT_TOKEN else EMPTY + if dist1 > 10: dist1=10 + dist2=abs(a0['id']-b0['id']) if b0 and a0 and b0 is not ABT_TOKEN and a0 is not ABT_TOKEN else EMPTY + if dist2 > 10: dist2=10 + if [s0_ne,tx] != 2*[None]:feats.append('s0_ne&tx=%s_%s_' % (s0_ne,tx)) + if [s0_w,tx] != 2*[None]:feats.append('s0_w&tx=%s_%s_' % (s0_w,tx)) + if [s0_lemma,tx] != 2*[None]:feats.append('s0_lemma&tx=%s_%s_' % (s0_lemma,tx)) + if [s0_t,tx] != 2*[None]:feats.append('s0_t&tx=%s_%s_' % (s0_t,tx)) + if [s0_dl,tx] != 2*[None]:feats.append('s0_dl&tx=%s_%s_' % (s0_dl,tx)) + if [s0_len,tx] != 2*[None]:feats.append('s0_len&tx=%s_%s_' % (s0_len,tx)) + if [s0_t,s0_txv] != 2*[None]:feats.append('s0_t&s0_txv=%s_%s_' % (s0_t,s0_txv)) + if [s0_frmset] != 1*[None]:feats.append('s0_frmset=%s_' % (s0_frmset)) + if [s0_eqfrmset] != 1*[None]:feats.append('s0_eqfrmset=%s_' % (s0_eqfrmset)) + if [b0_isarg] != 1*[None]:feats.append('b0_isarg=%s_' % (b0_isarg)) + if [b0_arglabel] != 1*[None]:feats.append('b0_arglabel=%s_' % (b0_arglabel)) + if [b0_dl,b0_arglabel] != 2*[None]:feats.append('b0_dl&b0_arglabel=%s_%s_' % (b0_dl,b0_arglabel)) + if [a0_isprd] != 1*[None]:feats.append('a0_isprd=%s_' % (a0_isprd)) + if [a0_prdlabel] != 1*[None]:feats.append('a0_prdlabel=%s_' % (a0_prdlabel)) + if [b0_isprd] != 1*[None]:feats.append('b0_isprd=%s_' % (b0_isprd)) + if [b0_prdlabel] != 1*[None]:feats.append('b0_prdlabel=%s_' % (b0_prdlabel)) + if [a0_isarg] != 1*[None]:feats.append('a0_isarg=%s_' % (a0_isarg)) + if [a0_arglabel] != 1*[None]:feats.append('a0_arglabel=%s_' % (a0_arglabel)) + if [s0_isnom] != 1*[None]:feats.append('s0_isnom=%s_' % (s0_isnom)) + if [s0_nech] != 1*[None]:feats.append('s0_nech=%s_' % (s0_nech)) + if [s0_lemma,s0_nech] != 2*[None]:feats.append('s0_lemma&s0_nech=%s_%s_' % (s0_lemma,s0_nech)) + if [s0_isnom,s0_nech] != 2*[None]:feats.append('s0_isnom&s0_nech=%s_%s_' % (s0_isnom,s0_nech)) + if [s0_c1lemma,tx] != 2*[None]:feats.append('s0_c1lemma&tx=%s_%s_' % (s0_c1lemma,tx)) + if [s0_c1dl,s0_c1lemma] != 2*[None]:feats.append('s0_c1dl&s0_c1lemma=%s_%s_' % (s0_c1dl,s0_c1lemma)) + if [s0_cpt,s0_p1_ne,s0_c1lemma] != 3*[None]:feats.append('s0_cpt&s0_p1_ne&s0_c1lemma=%s_%s_%s_' % (s0_cpt,s0_p1_ne,s0_c1lemma)) + if [b0_ne] != 1*[None]:feats.append('b0_ne=%s_' % (b0_ne)) + if [b0_w] != 1*[None]:feats.append('b0_w=%s_' % (b0_w)) + if [b0_lemma] != 1*[None]:feats.append('b0_lemma=%s_' % (b0_lemma)) + if [b0_t] != 1*[None]:feats.append('b0_t=%s_' % (b0_t)) + if [b0_dl] != 1*[None]:feats.append('b0_dl=%s_' % (b0_dl)) + if [b0_len] != 1*[None]:feats.append('b0_len=%s_' % (b0_len)) + if [b0_reph] != 1*[None]:feats.append('b0_reph=%s_' % (b0_reph)) + if [a0_ne] != 1*[None]:feats.append('a0_ne=%s_' % (a0_ne)) + if [a0_w] != 1*[None]:feats.append('a0_w=%s_' % (a0_w)) + if [a0_lemma] != 1*[None]:feats.append('a0_lemma=%s_' % (a0_lemma)) + if [a0_t] != 1*[None]:feats.append('a0_t=%s_' % (a0_t)) + if [a0_dl] != 1*[None]:feats.append('a0_dl=%s_' % (a0_dl)) + if [s0_p1_ne] != 1*[None]:feats.append('s0_p1_ne=%s_' % (s0_p1_ne)) + if [s0_p1_w] != 1*[None]:feats.append('s0_p1_w=%s_' % (s0_p1_w)) + if [s0_p1_lemma] != 1*[None]:feats.append('s0_p1_lemma=%s_' % (s0_p1_lemma)) + if [s0_p1_t] != 1*[None]:feats.append('s0_p1_t=%s_' % (s0_p1_t)) + if [s0_p1_dl] != 1*[None]:feats.append('s0_p1_dl=%s_' % (s0_p1_dl)) + if [b0_pathpwd,b0_lemma,s0_lemma] != 3*[None]:feats.append('b0_pathpwd&b0_lemma&s0_lemma=%s_%s_%s_' % (b0_pathpwd,b0_lemma,s0_lemma)) + if [b0_apathpwd,a0_lemma,b0_lemma] != 3*[None]:feats.append('b0_apathpwd&a0_lemma&b0_lemma=%s_%s_%s_' % (b0_apathpwd,a0_lemma,b0_lemma)) + if [b0_pathpwd] != 1*[None]:feats.append('b0_pathpwd=%s_' % (b0_pathpwd)) + if [b0_apathpwd] != 1*[None]:feats.append('b0_apathpwd=%s_' % (b0_apathpwd)) + if [b0_lemma,b0_rsb_dl] != 2*[None]:feats.append('b0_lemma&b0_rsb_dl=%s_%s_' % (b0_lemma,b0_rsb_dl)) + if [b0_lemma,b0_nswp] != 2*[None]:feats.append('b0_lemma&b0_nswp=%s_%s_' % (b0_lemma,b0_nswp)) + if [dist1] != 1*[None]:feats.append('dist1=%s_' % (dist1)) + if [dist1,b0_pathp] != 2*[None]:feats.append('dist1&b0_pathp=%s_%s_' % (dist1,b0_pathp)) + if [dist2] != 1*[None]:feats.append('dist2=%s_' % (dist2)) + if [dist2,b0_apathp] != 2*[None]:feats.append('dist2&b0_apathp=%s_%s_' % (dist2,b0_apathp)) + if [s0_lemma,b0_t] != 2*[None]:feats.append('s0_lemma&b0_t=%s_%s_' % (s0_lemma,b0_t)) + if [s0_lemma,b0_dl] != 2*[None]:feats.append('s0_lemma&b0_dl=%s_%s_' % (s0_lemma,b0_dl)) + if [s0_t,b0_lemma] != 2*[None]:feats.append('s0_t&b0_lemma=%s_%s_' % (s0_t,b0_lemma)) + if [s0_dl,b0_lemma] != 2*[None]:feats.append('s0_dl&b0_lemma=%s_%s_' % (s0_dl,b0_lemma)) + if [s0_ne,b0_ne] != 2*[None]:feats.append('s0_ne&b0_ne=%s_%s_' % (s0_ne,b0_ne)) + if [a0_t,b0_lemma] != 2*[None]:feats.append('a0_t&b0_lemma=%s_%s_' % (a0_t,b0_lemma)) + if [a0_dl,b0_lemma] != 2*[None]:feats.append('a0_dl&b0_lemma=%s_%s_' % (a0_dl,b0_lemma)) + if [a0_ne,b0_ne] != 2*[None]:feats.append('a0_ne&b0_ne=%s_%s_' % (a0_ne,b0_ne)) + if [a0_cpt,b0_cpt] != 2*[None]:feats.append('a0_cpt&b0_cpt=%s_%s_' % (a0_cpt,b0_cpt)) + if [a0_cpt,b0_cpt,s0_lemma] != 3*[None]:feats.append('a0_cpt&b0_cpt&s0_lemma=%s_%s_%s_' % (a0_cpt,b0_cpt,s0_lemma)) + if [a0_cpt,b0_ne] != 2*[None]:feats.append('a0_cpt&b0_ne=%s_%s_' % (a0_cpt,b0_ne)) + return feats \ No newline at end of file