Skip to content

Commit

Permalink
add scripts for semeval amr training and eval
Browse files Browse the repository at this point in the history
  • Loading branch information
Juicechuan committed Mar 23, 2016
1 parent 6f255f8 commit 91caa9e
Show file tree
Hide file tree
Showing 23 changed files with 270 additions and 18 deletions.
Binary file added .DS_Store
Binary file not shown.
46 changes: 36 additions & 10 deletions amr_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,16 @@ def main():
arg_parser.add_argument('-s','--start_step',type=int,default=0,help='specify which step to begin oracle testing;for debug')
#arg_parser.add_argument('-i','--input_file',help='the input: preprocessed data instances file for aligner or training')
arg_parser.add_argument('-d','--dev',help='development file')
arg_parser.add_argument('-a','--add',help='additional training file')
arg_parser.add_argument('-as','--actionset',choices=['basic'],default='basic',help='choose different action set')
arg_parser.add_argument('-m','--mode',choices=['preprocess','test_gold_graph','align','userGuide','oracleGuide','train','parse','eval'],help="preprocess:generate pos tag, dependency tree, ner\n" "align:do alignment between AMR graph and sentence string")
arg_parser.add_argument('-dp','--depparser',choices=['stanford','stanfordConvert','stdconv+charniak','clear','mate','turbo'],default='stdconv+charniak',help='choose the dependency parser')
arg_parser.add_argument('--coref',action='store_true',help='flag to enable coreference information')
arg_parser.add_argument('--prop',action='store_true',help='flag to enable semantic role labeling information')
arg_parser.add_argument('--rne',action='store_true',help='flag to enable rich name entity')
arg_parser.add_argument('--onto',action='store_true',help='flag to enable charniak parse result trained on ontonotes')
arg_parser.add_argument('--verblist',action='store_true',help='flag to enable verbalization list')
#arg_parser.add_argument('--onto',action='store_true',help='flag to enable charniak parse result trained on ontonotes')
arg_parser.add_argument('--onto',choices=['onto','onto+bolt','wsj'],default='wsj',help='choose which charniak parse result trained on ontonotes')
arg_parser.add_argument('--model',help='specify the model file')
arg_parser.add_argument('--feat',help='feature template file')
arg_parser.add_argument('-iter','--iterations',default=1,type=int,help='training iterations')
Expand All @@ -156,6 +159,7 @@ def main():
constants.FLAG_COREF=args.coref
constants.FLAG_PROP=args.prop
constants.FLAG_RNE=args.rne
constants.FLAG_VERB=args.verblist
constants.FLAG_ONTO=args.onto
constants.FLAG_DEPPARSER=args.depparser

Expand Down Expand Up @@ -308,9 +312,11 @@ def main():
print "Incorporate Coref Information: %s"%(constants.FLAG_COREF)
print "Incorporate SRL Information: %s"%(constants.FLAG_PROP)
print "Substitue the normal name entity tag with rich name entity tag: %s"%(constants.FLAG_RNE)
print "Using verbalization list: %s"%(constants.FLAG_VERB)
print "Using charniak parser trained on ontonotes: %s"%(constants.FLAG_ONTO)
print "Dependency parser used: %s"%(constants.FLAG_DEPPARSER)
train_instances = preprocess(amr_file,START_SNLP=False)
train_instances = preprocess(amr_file,START_SNLP=False)
if args.add: train_instances = train_instances + preprocess(args.add,START_SNLP=False)
if args.dev: dev_instances = preprocess(args.dev,START_SNLP=False)


Expand All @@ -330,6 +336,11 @@ def main():
model.setup(action_type=args.actionset,instances=train_instances,parser=parser,feature_templates_file=feat_template)

print >> experiment_log, "BEGIN TRAINING!"
best_fscore = 0.0
best_pscore = 0.0
best_rscore = 0.0
best_model = None
best_iter = 1
for iter in xrange(1,args.iterations+1):
print >> experiment_log, "shuffling training instances"
random.shuffle(train_instances)
Expand All @@ -338,22 +349,36 @@ def main():
begin_updates = parser.perceptron.get_num_updates()
parser.parse_corpus_train(train_instances)
parser.perceptron.average_weight()
#model.save_model(args.model+'-iter'+str(iter)+'-'+str(int(time.time()))+'.m')
model.save_model(args.model+'-iter'+str(iter)+'.m')

if args.dev:
print >> experiment_log ,"Result on develop set:"
_,parsed_amr = parser.parse_corpus_test(dev_instances)
write_parsed_amr(parsed_amr,dev_instances,args.dev,args.section+'.'+str(iter)+'.parsed')
parsed_suffix = args.section+'.'+args.model.split('.')[-1]+'.'+str(iter)+'.parsed'
write_parsed_amr(parsed_amr,dev_instances,args.dev,parsed_suffix)
if args.smatcheval:
smatch_path = "./smatch_2.0.2/smatch.py"
python_path = 'python'
options = '--pr -f'
parsed_filename = args.dev+'.'+args.section+'.'+str(iter)+'.parsed'
parsed_filename = args.dev+'.'+parsed_suffix
command = '%s %s %s %s %s' % (python_path, smatch_path, options, parsed_filename, args.dev)

print 'Evaluation using command: ' + (command)
print subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True)

#print subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True)
eval_output = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True)
print eval_output
pscore = float(eval_output.split('\n')[0].split(':')[1].rstrip())
rscore = float(eval_output.split('\n')[1].split(':')[1].rstrip())
fscore = float(eval_output.split('\n')[2].split(':')[1].rstrip())
if fscore > best_fscore:
best_model = model
best_iter = iter
best_fscore = fscore
best_pscore = pscore
best_rscore = rscore

if best_model is not None:
print >> experiment_log, "Best result on iteration %d:\n Precision: %f\n Recall: %f\n F-score: %f" % (best_iter, best_pscore, best_rscore, best_fscore)
best_model.save_model(args.model+'.m')
print >> experiment_log ,"DONE TRAINING!"

elif args.mode == 'parse': # actual parsing
Expand All @@ -369,7 +394,8 @@ def main():
parser = Parser(model=model,oracle_type=DET_T2G_ORACLE_ABT,action_type=args.actionset,verbose=args.verbose,elog=experiment_log)
print >> experiment_log ,"BEGIN PARSING"
span_graph_pairs,results = parser.parse_corpus_test(test_instances)
write_parsed_amr(results,test_instances,amr_file,suffix='%s.parsed'%(args.section))
parsed_suffix = '%s.%s.parsed'%(args.section,args.model.split('.')[-2])
write_parsed_amr(results,test_instances,amr_file,suffix=parsed_suffix)
#write_span_graph(span_graph_pairs,test_instances,amr_file,suffix='spg.50')
################
# for eval #
Expand All @@ -381,7 +407,7 @@ def main():
smatch_path = "./smatch_2.0.2/smatch.py"
python_path = 'python'
options = '--pr -f'
parsed_filename = amr_file+'.'+args.section+'.parsed'
parsed_filename = amr_file+'.'+parsed_suffix
command = '%s %s %s %s %s' % (python_path,smatch_path,options,parsed_filename, amr_file)

print 'Evaluation using command: ' + (command)
Expand Down
3 changes: 2 additions & 1 deletion constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
FLAG_COREF=False
FLAG_PROP=False
FLAG_RNE=False
FLAG_VERB=False
FLAG_DEPPARSER='stanford'
FLAG_ONTO=False
FLAG_ONTO='wsj'

# constants
NOT_APPLY='_NOT_APPLY_'
Expand Down
19 changes: 14 additions & 5 deletions preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def readAMREval(eval_file_path):
'''
read in semeval evaluation format (without amr)
'''
eval_file = codecs.open(eval_file_path,'r',encoding='macroman')
eval_file = codecs.open(eval_file_path,'r',encoding='utf-8')
comment_list = []
comment = OrderedDict()
#amr_list = []
Expand Down Expand Up @@ -145,7 +145,11 @@ def _load_cparse(cparse_filename):

def _fix_prop_head(inst,ctree,start_index,height):
head_index = None
tree_pos = ctree.leaf_treeposition(start_index)
try:
tree_pos = ctree.leaf_treeposition(start_index)
except IndexError:
import pdb
pdb.set_trace()
span_root = ctree[tree_pos[:-(height+1)]]
end_index = start_index + len(span_root.leaves())
cur = inst.tokens[start_index+1]
Expand Down Expand Up @@ -330,7 +334,7 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):
if 'alignments' in comments[i]:
alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
# use verbalization list to fix the unaligned tokens
Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment)
if constants.FLAG_VERB: Aligner.postProcessVerbList(amr, comments[i]['tok'], alignment)
#ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
#ggraph.pre_merge_netag(instances[i])
Expand Down Expand Up @@ -419,7 +423,12 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):
_add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

elif constants.FLAG_DEPPARSER == "stdconv+charniak":
dep_filename = tok_sent_filename+'.charniak.onto.parse.dep' if constants.FLAG_ONTO else tok_sent_filename+'.charniak.parse.dep'
if constants.FLAG_ONTO == 'onto':
dep_filename = tok_sent_filename+'.charniak.onto.parse.dep'
elif constants.FLAG_ONTO == 'onto+bolt':
dep_filename = tok_sent_filename+'.charniak.onto+bolt.parse.dep'
else:
dep_filename = tok_sent_filename+'.charniak.parse.dep'
if not os.path.exists(dep_filename):
dparser = CharniakParser()
dparser.parse(tok_sent_filename)
Expand Down Expand Up @@ -463,7 +472,7 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):

if constants.FLAG_PROP:
print >> log, "Adding SRL information..."
prop_filename = tok_sent_filename + '.prop'
prop_filename = tok_sent_filename + '.prop' if constants.FLAG_ONTO != 'onto+bolt' else tok_sent_filename + '.onto+bolt.prop'
if os.path.exists(prop_filename):
if constants.FLAG_DEPPARSER == "stdconv+charniak":
_add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
Expand Down
4 changes: 4 additions & 0 deletions scripts/cmd.test.brown-verb-onto-rne.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Parse ./data/semeval/test.txt with the brown-verb-onto-rne model
# (--onto onto, --rne) and request smatch scoring via --smatcheval.
# The job is started in the background; stdout and stderr both go to $log.

model=./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne.m
log=./log/amr-semeval-all.test.test.basic-abt-brown-verb-onto-rne.log

echo "Testing ..."
/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --onto onto --rne --smatcheval \
    --model "$model" ./data/semeval/test.txt > "$log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.test.brown-verb-onto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Parse ./data/semeval/test.txt with the brown-verb-onto model (--onto onto)
# and request smatch scoring via --smatcheval.
# The job is started in the background; stdout and stderr both go to $log.

model=./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto.m
log=./log/amr-semeval-all.test.test.basic-abt-brown-verb-onto.log

echo "Testing ..."
/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --onto onto --smatcheval \
    --model "$model" ./data/semeval/test.txt > "$log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.test.brown-verb-rne.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Parse ./data/semeval/test.txt with the brown-verb-rne model (--rne)
# and request smatch scoring via --smatcheval.
# The job is started in the background; stdout and stderr both go to $log.

model=./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-rne.m
log=./log/amr-semeval-all.test.test.basic-abt-brown-verb-rne.log

echo "Testing ..."
/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --rne --smatcheval \
    --model "$model" ./data/semeval/test.txt > "$log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.test.brown-verb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Parse ./data/semeval/test.txt with the brown-verb model and request
# smatch scoring via --smatcheval.
# The job is started in the background; stdout and stderr both go to $log.

model=./models/semeval/amr-semeval-all.train.basic-abt-brown-verb.m
log=./log/amr-semeval-all.test.test.basic-abt-brown-verb.log

echo "Testing ..."
/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --smatcheval \
    --model "$model" ./data/semeval/test.txt > "$log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Parse ./data/semeval/test.txt with the basic-abt model and request
# smatch scoring via --smatcheval.
# The job is started in the background; stdout and stderr both go to $log.

model=./models/semeval/amr-semeval-all.train.basic-abt.m
log=./log/amr-semeval-all.test.basic-abt.log

echo "Testing ..."
/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --smatcheval \
    --model "$model" ./data/semeval/test.txt > "$log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.test.verb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Parse ./data/semeval/test.txt with the basic-abt-verb model and request
# smatch scoring via --smatcheval.
# The job is started in the background; stdout and stderr both go to $log.

model=./models/semeval/amr-semeval-all.train.basic-abt-verb.m
log=./log/amr-semeval-all.test.basic-abt-verb.log

echo "Testing ..."
/usr/bin/python -u amr_parsing.py -m parse --amrfmt amr --smatcheval \
    --model "$model" ./data/semeval/test.txt > "$log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.brown-verb-onto+bolt-rne-srl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the brown-verb-onto+bolt-rne-srl model for 5 iterations on
# ./data/semeval/training.txt, with ./data/semeval/dev.txt as the development
# file (-d) and smatch scoring requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt-brown-verb-onto+bolt-rne-srl

echo "Training Model ..."
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --prop --rne --onto 'onto+bolt' --smatcheval \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_srl_brown_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
2 changes: 1 addition & 1 deletion scripts/cmd.train.brown-verb-onto-rne-srl.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/sh

echo "Training Model ..."
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --prop --rne --onto --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne-srl -iter 5 --feat ./feature/basic_abt_srl_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/test.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-rne-srl.log 2>&1 &
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --prop --rne --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne-srl -iter 5 --feat ./feature/basic_abt_srl_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-rne-srl.log 2>&1 &
2 changes: 1 addition & 1 deletion scripts/cmd.train.brown-verb-onto-rne.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/sh

echo "Training Model ..."
/usr/bin/python amr_parsing.py -m train --amrfmt amr --rne --onto --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne -iter 5 --feat ./feature/basic_abt_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-rne.log 2>&1 &
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --rne --verblist --smatcheval --model ./models/semeval/amr-semeval-all.train.basic-abt-brown-verb-onto-rne -iter 5 --feat ./feature/basic_abt_brown_feats.templates ./data/semeval/training.txt -d ./data/semeval/dev.txt > ./log/amr-semeval-all.train.basic-abt-brown-verb-onto-rne.log 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.brown-verb-onto-srl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the brown-verb-onto-srl model for 5 iterations on
# ./data/semeval/training.txt, with ./data/semeval/dev.txt as the development
# file (-d) and smatch scoring requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt-brown-verb-onto-srl

echo "Training Model ..."
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --prop --verblist --smatcheval \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_srl_brown_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.brown-verb-onto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the brown-verb-onto model for 5 iterations on
# ./data/semeval/training.txt, with ./data/semeval/dev.txt as the development
# file (-d) and smatch scoring requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt-brown-verb-onto

echo "Training Model ..."
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --verblist --smatcheval \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_brown_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.brown-verb-rne.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the brown-verb-rne model for 5 iterations on
# ./data/semeval/training.txt, with ./data/semeval/dev.txt as the development
# file (-d) and smatch scoring requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt-brown-verb-rne

echo "Training Model ..."
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --verblist --smatcheval --rne \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_brown_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.brown-verb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the brown-verb model for 5 iterations on ./data/semeval/training.txt,
# with ./data/semeval/dev.txt as the development file (-d) and smatch scoring
# requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt-brown-verb

echo "Training Model ..."
# -u (unbuffered output) matches the other train/test scripts: because output
# is redirected to a file, it would otherwise be block-buffered and the log
# would lag behind the running job.
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --verblist --smatcheval \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_brown_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.onto-verb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the onto-verb model for 5 iterations on ./data/semeval/training.txt,
# with ./data/semeval/dev.txt as the development file (-d) and smatch scoring
# requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt-onto-verb

echo "Training Model ..."
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --verblist --smatcheval \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.onto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the onto model for 5 iterations on ./data/semeval/training.txt,
# with ./data/semeval/dev.txt as the development file (-d) and smatch scoring
# requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt-onto

echo "Training Model ..."
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --onto onto --smatcheval \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the basic-abt model for 5 iterations on ./data/semeval/training.txt,
# with ./data/semeval/dev.txt as the development file (-d) and smatch scoring
# requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt

echo "Training Model ..."
# -u (unbuffered output) matches the other train/test scripts: because output
# is redirected to a file, it would otherwise be block-buffered and the log
# would lag behind the running job.
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --smatcheval \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
4 changes: 4 additions & 0 deletions scripts/cmd.train.verb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Train the verb model for 5 iterations on ./data/semeval/training.txt,
# with ./data/semeval/dev.txt as the development file (-d) and smatch scoring
# requested via --smatcheval.
# The job is started in the background; stdout and stderr both go to the log.

name=amr-semeval-all.train.basic-abt-verb

echo "Training Model ..."
/usr/bin/python -u amr_parsing.py -m train --amrfmt amr --verblist --smatcheval \
    --model "./models/semeval/$name" -iter 5 \
    --feat ./feature/basic_abt_feats.templates \
    ./data/semeval/training.txt -d ./data/semeval/dev.txt > "./log/$name.log" 2>&1 &
34 changes: 34 additions & 0 deletions scripts/fix_multi_sent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

'''
fix charniak parse file with multiple sentences
'''
from nltk.tree import Tree
import sys
import re
import codecs

def fix_multi_sent(line):
    '''
    Rewrite one bracketed parse so that its root has a single child.

    line: one parse tree in bracketed notation (parsed with Tree.fromstring).

    When the root has more than one child, all children are regrouped under
    a new 'S' node which becomes the only child of a new 'S1' root (the
    original root label is not kept). A root with at most one child is left
    as it is.

    Returns the tree rendered on a single line: every newline in the tree's
    string form, together with the indentation that follows it, is replaced
    by one space. A blank line is returned unchanged.
    '''
    if not line.strip():
        # No tree on this line; pass it through so the output keeps one
        # line per input line instead of failing inside the tree parser.
        return line
    tree = Tree.fromstring(line)
    if len(tree) > 1:
        newtree = Tree('S1', [Tree('S', tree[:])])
    else:
        newtree = tree
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\n\s*', ' ', str(newtree))


if __name__ == '__main__':
    # Usage: python fix_multi_sent.py <parse_file.ext>
    # Reads one bracketed parse per line from <parse_file.ext>, runs
    # fix_multi_sent on each stripped line, and writes the results to the
    # same path with its last extension removed.
    import os.path

    if len(sys.argv) < 2:
        sys.exit('usage: python fix_multi_sent.py <parse_file.ext>')

    old_parse_file = sys.argv[1]
    # splitext only looks at the final path component, so a dot in a
    # directory name (e.g. './data/parses') is never mistaken for an extension.
    new_parse_file, extension = os.path.splitext(old_parse_file)
    if not extension:
        # Without an extension the output path would equal the input path
        # and the original parse file would be overwritten.
        sys.exit('%s has no extension to strip; refusing to overwrite it'
                 % old_parse_file)

    # Single parenthesized argument: valid as both a Python 2 print
    # statement and a Python 3 print call.
    print('%s >> %s' % (old_parse_file, new_parse_file))

    with codecs.open(old_parse_file, 'r', encoding='utf-8') as f:
        result = [fix_multi_sent(line.strip()) for line in f]

    with codecs.open(new_parse_file, 'w', encoding='utf-8') as wf:
        wf.write('\n'.join(result))
        wf.write('\n')


Loading

0 comments on commit 91caa9e

Please sign in to comment.