Skip to content

Commit

Permalink
Update documentation and fix file suffix ambiguity.
Browse files Browse the repository at this point in the history
  • Loading branch information
Juicechuan committed Jul 2, 2015
1 parent 266280e commit a3ffa05
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 10 deletions.
26 changes: 22 additions & 4 deletions amr_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,9 @@ def main():
arg_parser.add_argument('--feat',help='feature template file')
arg_parser.add_argument('-iter','--iterations',default=1,type=int,help='training iterations')
arg_parser.add_argument('amr_file',nargs='?',help='amr annotation file/input sentence file for parsing')
arg_parser.add_argument('--amrfmt',action='store_true',help='specifying the input file is AMR annotation file')
arg_parser.add_argument('-e','--eval',nargs=2,help='Error Analysis: give parsed AMR file and gold AMR file')
arg_parser.add_argument('--section',choices=['proxy','all'],default='all',help='choose section of the corpus. Only works for LDC2014T12 dataset.')

args = arg_parser.parse_args()

Expand All @@ -154,7 +156,7 @@ def main():

# using corenlp to preprocess the sentences
if args.mode == 'preprocess':
instances = preprocess(amr_file,START_SNLP=True,INPUT_AMR=False)
instances = preprocess(amr_file,START_SNLP=True,INPUT_AMR=args.amrfmt)
print "Done preprocessing!"
# preprocess the JAMR aligned amr
elif args.mode == 'test_gold_graph':
Expand Down Expand Up @@ -301,8 +303,19 @@ def main():
print "Incorporate Coref Information: %s"%(constants.FLAG_COREF)
print "Incorporate SRL Information: %s"%(constants.FLAG_PROP)
print "Dependency parser used: %s"%(constants.FLAG_DEPPARSER)
train_instances = preprocess(amr_file,START_SNLP=False)
train_instances = preprocess(amr_file,START_SNLP=False)
if args.dev: dev_instances = preprocess(args.dev,START_SNLP=False)


if args.section != 'all':
print "Choosing corpus section: %s"%(args.section)
tcr = constants.get_corpus_range(args.section,'train')
train_instances = train_instances[tcr[0]:tcr[1]]
if args.dev:
dcr = constants.get_corpus_range(args.section,'dev')
dev_instances = dev_instances[dcr[0]:dcr[1]]


feat_template = args.feat if args.feat else None
model = Model(elog=experiment_log)
#model.output_feature_generator()
Expand All @@ -323,19 +336,24 @@ def main():
if args.dev:
print >> experiment_log ,"Result on develop set:"
_,parsed_amr = parser.parse_corpus_test(dev_instances)
write_parsed_amr(parsed_amr,dev_instances,args.dev,str(iter)+'.parsed')
write_parsed_amr(parsed_amr,dev_instances,args.dev,args.section+'.'+str(iter)+'.parsed')

print >> experiment_log ,"DONE TRAINING!"

elif args.mode == 'parse': # actual parsing
test_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=False)
if args.section != 'all':
print "Choosing corpus section: %s"%(args.section)
tcr = constants.get_corpus_range(args.section,'test')
test_instances = test_instances[tcr[0]:tcr[1]]

#random.shuffle(test_instances)
print >> experiment_log, "Loading model: ", args.model
model = Model.load_model(args.model)
parser = Parser(model=model,oracle_type=DET_T2G_ORACLE_ABT,action_type=args.actionset,verbose=args.verbose,elog=experiment_log)
print >> experiment_log ,"BEGIN PARSING"
span_graph_pairs,results = parser.parse_corpus_test(test_instances)
write_parsed_amr(results,test_instances,amr_file,suffix='parsed')
write_parsed_amr(results,test_instances,amr_file,suffix='%s.parsed'%(args.section))
#write_span_graph(span_graph_pairs,test_instances,amr_file,suffix='spg.50')
################
# for eval #
Expand Down
30 changes: 29 additions & 1 deletion constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,4 +203,32 @@ def _load_brown_cluster(dir_path,cluster_num=1000):

return cluster_dict

BROWN_CLUSTER=_load_brown_cluster(DEFAULT_BROWN_CLUSTER)
BROWN_CLUSTER=_load_brown_cluster(DEFAULT_BROWN_CLUSTER)

# Given a corpus domain (section), return the index range it occupies in the split corpus. #TODO: move this part to config file
# Index ranges of each domain section inside the train/dev/test splits.
# Each value is a (start, end) pair used by callers as a slice
# ``instances[start:end]``, so ``end`` is exclusive and adjacent sections abut.
# Built once at import time instead of on every call.
_DOMAIN_RANGE_TABLE = {
    'train': {
        'proxy': (0, 6603),
        'bolt': (6603, 7664),
        'dfa': (7664, 9367),
        'mt09sdf': (9367, 9571),
        'xinhua': (9571, 10312),
    },
    'dev': {
        'proxy': (0, 826),
        'bolt': (826, 959),
        'consensus': (959, 1059),
        'dfa': (1059, 1269),
        'xinhua': (1269, 1368),
    },
    'test': {
        'proxy': (0, 823),
        'bolt': (823, 956),
        'consensus': (956, 1056),
        'dfa': (1056, 1285),
        'xinhua': (1285, 1371),
    },
}


def get_corpus_range(corpus_section, corpus_type):
    """Return the (start, end) index range of a corpus section.

    Args:
        corpus_section: domain name, e.g. 'proxy', 'bolt', 'dfa', 'xinhua'.
            Not every section exists in every split ('mt09sdf' is only in
            'train', 'consensus' only in 'dev' and 'test').
        corpus_type: which split to look in: 'train', 'dev' or 'test'.

    Returns:
        A ``(start, end)`` tuple of instance indices, suitable for slicing
        the list of instances of that split as ``instances[start:end]``.

    Raises:
        KeyError: if ``corpus_type`` is not a known split or
            ``corpus_section`` is not present in that split. The message
            lists the valid choices.
    """
    if corpus_type not in _DOMAIN_RANGE_TABLE:
        raise KeyError("unknown corpus type %r; expected one of %s"
                       % (corpus_type, sorted(_DOMAIN_RANGE_TABLE)))
    sections = _DOMAIN_RANGE_TABLE[corpus_type]
    if corpus_section not in sections:
        raise KeyError("unknown corpus section %r for %r; expected one of %s"
                       % (corpus_section, corpus_type, sorted(sections)))
    return sections[corpus_section]
6 changes: 3 additions & 3 deletions preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR=True):
if INPUT_AMR: # the input file is amr annotation

amr_file = input_file
aligned_amr_file = amr_file + '.aligned'
aligned_amr_file = amr_file + '.amr.tok.aligned'
if os.path.exists(aligned_amr_file):
comments,amr_strings = readAMR(aligned_amr_file)
else:
Expand All @@ -261,7 +261,7 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR=True):
if not os.path.exists(tok_sent_filename):
_write_tok_sentences(tok_sent_filename,instances)

tok_amr_filename = amr_file + '.tok'
tok_amr_filename = amr_file + '.amr.tok'
if not os.path.exists(tok_amr_filename): # write tokenized amr file
_write_tok_amr(tok_amr_filename,amr_file,instances)

Expand Down Expand Up @@ -362,7 +362,7 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR=True):
pass

if constants.FLAG_PROP:
print >> log, "adding SRL information..."
print >> log, "Adding SRL information..."
prop_filename = tok_sent_filename + '.prop'
if os.path.exists(prop_filename):
if constants.FLAG_DEPPARSER == "stdconv+charniak":
Expand Down
4 changes: 2 additions & 2 deletions scripts/jamr_align.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ ${JAMR_HOME}/scripts/config.sh
#### Align the tokenized amr file ####

echo "### Aligning $1 ###"

${JAMR_HOME}/run Aligner -v 0 < $1.tok > $1.aligned
# input should be tokenized AMR file, which has :tok tag in the comments
${JAMR_HOME}/run Aligner -v 0 < $1 > $1.aligned
150 changes: 150 additions & 0 deletions temp/feats_gen_basic_abt_srl_brown_feats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#generated by model.py
from constants import *
def generate_features(state,action):
    """Build the list of string features for a (state, action) pair.

    The context tokens ``s0``, ``b0`` and ``a0`` come from
    ``state.get_feature_context_window(action)``; any of them may be missing
    (falsy), in which case every attribute read from it is replaced by the
    ``EMPTY`` placeholder.  ``tx`` is ``action['tag']`` when the action
    carries a tag, otherwise ``EMPTY``.

    Each feature is rendered as ``'<name1>&<name2>...=<v1>_<v2>_...'`` and is
    emitted unless every one of its values equals ``None``.

    NOTE(review): this file is generated by model.py; the two fixes below
    (missing ``s0`` guard for ``dist1`` and the placeholder-safe distance
    clamp) should also be applied to the generator, otherwise they will be
    lost on regeneration.

    Args:
        state: parser state; must provide ``get_feature_context_window`` and
            ``model.class_codebook.get_index``.
        action: dict with at least a ``'type'`` key and optionally ``'tag'``.

    Returns:
        A list of feature strings, in a fixed template order.
    """
    s0,b0,a0=state.get_feature_context_window(action)
    feats=[]
    # Result is unused here; the call is kept because the lookup itself may
    # matter to the codebook (cannot tell from this file).
    act_idx = state.model.class_codebook.get_index(action['type'])
    tx = action['tag'] if 'tag' in action else EMPTY

    # ---- atomic attributes of the context tokens ----
    s0_ne=s0['ne'] if s0 else EMPTY
    s0_w=s0['form'] if s0 else EMPTY
    s0_lemma=s0['lemma'] if s0 else EMPTY
    s0_t=s0['pos'] if s0 else EMPTY
    s0_dl=s0['rel'] if s0 else EMPTY
    s0_len=s0['len'] if s0 else EMPTY
    s0_txv=s0['txv'] if s0 else EMPTY
    s0_brown4=s0['brown4'] if s0 else EMPTY
    s0_brown6=s0['brown6'] if s0 else EMPTY
    s0_brown10=s0['brown10'] if s0 else EMPTY
    s0_brown20=s0['brown20'] if s0 else EMPTY
    s0_frmset=s0['frmset'] if s0 else EMPTY
    s0_eqfrmset=s0['eqfrmset'] if s0 else EMPTY
    b0_isarg=b0['isarg'] if b0 else EMPTY
    b0_arglabel=b0['arglabel'] if b0 else EMPTY
    b0_dl=b0['rel'] if b0 else EMPTY
    a0_isprd=a0['isprd'] if a0 else EMPTY
    a0_prdlabel=a0['prdlabel'] if a0 else EMPTY
    b0_isprd=b0['isprd'] if b0 else EMPTY
    b0_prdlabel=b0['prdlabel'] if b0 else EMPTY
    a0_isarg=a0['isarg'] if a0 else EMPTY
    a0_arglabel=a0['arglabel'] if a0 else EMPTY
    s0_isnom=s0['isnom'] if s0 else EMPTY
    s0_nech=s0['nech'] if s0 else EMPTY
    s0_c1lemma=s0['c1lemma'] if s0 else EMPTY
    s0_c1dl=s0['c1dl'] if s0 else EMPTY
    s0_cpt=s0['concept'] if s0 else EMPTY
    s0_p1_ne=s0['p1']['ne'] if s0 and s0['p1'] else EMPTY
    b0_ne=b0['ne'] if b0 else EMPTY
    b0_w=b0['form'] if b0 else EMPTY
    b0_lemma=b0['lemma'] if b0 else EMPTY
    b0_t=b0['pos'] if b0 else EMPTY
    b0_len=b0['len'] if b0 else EMPTY
    b0_reph=b0['reph'] if b0 else EMPTY
    b0_brown4=b0['brown4'] if b0 else EMPTY
    b0_brown6=b0['brown6'] if b0 else EMPTY
    b0_brown10=b0['brown10'] if b0 else EMPTY
    b0_brown20=b0['brown20'] if b0 else EMPTY
    a0_ne=a0['ne'] if a0 else EMPTY
    a0_w=a0['form'] if a0 else EMPTY
    a0_lemma=a0['lemma'] if a0 else EMPTY
    a0_t=a0['pos'] if a0 else EMPTY
    a0_dl=a0['rel'] if a0 else EMPTY
    a0_brown4=a0['brown4'] if a0 else EMPTY
    a0_brown6=a0['brown6'] if a0 else EMPTY
    a0_brown10=a0['brown10'] if a0 else EMPTY
    a0_brown20=a0['brown20'] if a0 else EMPTY
    s0_p1_w=s0['p1']['form'] if s0 and s0['p1'] else EMPTY
    s0_p1_lemma=s0['p1']['lemma'] if s0 and s0['p1'] else EMPTY
    s0_p1_t=s0['p1']['pos'] if s0 and s0['p1'] else EMPTY
    s0_p1_dl=s0['p1']['rel'] if s0 and s0['p1'] else EMPTY
    b0_pathpwd=b0['pathpwd'] if b0 else EMPTY
    b0_apathpwd=b0['apathpwd'] if b0 else EMPTY
    b0_rsb_dl=b0['rsb']['rel'] if b0 and b0['rsb'] else EMPTY
    b0_nswp=b0['nswp'] if b0 else EMPTY
    b0_pathp=b0['pathp'] if b0 else EMPTY
    b0_apathp=b0['apathp'] if b0 else EMPTY
    a0_cpt=a0['concept'] if a0 else EMPTY
    b0_cpt=b0['concept'] if b0 else EMPTY

    # ---- token distances, clamped to at most 10 ----
    # Fix: also require s0 to be present before reading s0['id'], matching
    # the guards used for every other s0 access and for dist2 below.
    dist1=abs(s0['id']-b0['id']) if s0 and b0 and b0 is not ABT_TOKEN and s0 is not ABT_TOKEN else EMPTY
    # Fix: never order-compare the EMPTY placeholder with an int; that only
    # worked through Python 2's arbitrary mixed-type ordering.
    if dist1 is not EMPTY and dist1 > 10: dist1=10
    dist2=abs(a0['id']-b0['id']) if b0 and a0 and b0 is not ABT_TOKEN and a0 is not ABT_TOKEN else EMPTY
    if dist2 is not EMPTY and dist2 > 10: dist2=10

    # ---- feature templates (order and strings must stay stable) ----
    if [s0_ne,tx] != 2*[None]:feats.append('s0_ne&tx=%s_%s_' % (s0_ne,tx))
    if [s0_w,tx] != 2*[None]:feats.append('s0_w&tx=%s_%s_' % (s0_w,tx))
    if [s0_lemma,tx] != 2*[None]:feats.append('s0_lemma&tx=%s_%s_' % (s0_lemma,tx))
    if [s0_t,tx] != 2*[None]:feats.append('s0_t&tx=%s_%s_' % (s0_t,tx))
    if [s0_dl,tx] != 2*[None]:feats.append('s0_dl&tx=%s_%s_' % (s0_dl,tx))
    if [s0_len,tx] != 2*[None]:feats.append('s0_len&tx=%s_%s_' % (s0_len,tx))
    if [s0_t,s0_txv] != 2*[None]:feats.append('s0_t&s0_txv=%s_%s_' % (s0_t,s0_txv))
    if [s0_brown4] != 1*[None]:feats.append('s0_brown4=%s_' % (s0_brown4))
    if [s0_brown6] != 1*[None]:feats.append('s0_brown6=%s_' % (s0_brown6))
    if [s0_brown10] != 1*[None]:feats.append('s0_brown10=%s_' % (s0_brown10))
    if [s0_brown20] != 1*[None]:feats.append('s0_brown20=%s_' % (s0_brown20))
    if [s0_frmset] != 1*[None]:feats.append('s0_frmset=%s_' % (s0_frmset))
    if [s0_eqfrmset] != 1*[None]:feats.append('s0_eqfrmset=%s_' % (s0_eqfrmset))
    if [b0_isarg] != 1*[None]:feats.append('b0_isarg=%s_' % (b0_isarg))
    if [b0_arglabel] != 1*[None]:feats.append('b0_arglabel=%s_' % (b0_arglabel))
    if [b0_dl,b0_arglabel] != 2*[None]:feats.append('b0_dl&b0_arglabel=%s_%s_' % (b0_dl,b0_arglabel))
    if [a0_isprd] != 1*[None]:feats.append('a0_isprd=%s_' % (a0_isprd))
    if [a0_prdlabel] != 1*[None]:feats.append('a0_prdlabel=%s_' % (a0_prdlabel))
    if [b0_isprd] != 1*[None]:feats.append('b0_isprd=%s_' % (b0_isprd))
    if [b0_prdlabel] != 1*[None]:feats.append('b0_prdlabel=%s_' % (b0_prdlabel))
    if [a0_isarg] != 1*[None]:feats.append('a0_isarg=%s_' % (a0_isarg))
    if [a0_arglabel] != 1*[None]:feats.append('a0_arglabel=%s_' % (a0_arglabel))
    if [s0_isnom] != 1*[None]:feats.append('s0_isnom=%s_' % (s0_isnom))
    if [s0_nech] != 1*[None]:feats.append('s0_nech=%s_' % (s0_nech))
    if [s0_lemma,s0_nech] != 2*[None]:feats.append('s0_lemma&s0_nech=%s_%s_' % (s0_lemma,s0_nech))
    if [s0_isnom,s0_nech] != 2*[None]:feats.append('s0_isnom&s0_nech=%s_%s_' % (s0_isnom,s0_nech))
    if [s0_c1lemma,tx] != 2*[None]:feats.append('s0_c1lemma&tx=%s_%s_' % (s0_c1lemma,tx))
    if [s0_c1dl,s0_c1lemma] != 2*[None]:feats.append('s0_c1dl&s0_c1lemma=%s_%s_' % (s0_c1dl,s0_c1lemma))
    if [s0_cpt,s0_p1_ne,s0_c1lemma] != 3*[None]:feats.append('s0_cpt&s0_p1_ne&s0_c1lemma=%s_%s_%s_' % (s0_cpt,s0_p1_ne,s0_c1lemma))
    if [b0_ne] != 1*[None]:feats.append('b0_ne=%s_' % (b0_ne))
    if [b0_w] != 1*[None]:feats.append('b0_w=%s_' % (b0_w))
    if [b0_lemma] != 1*[None]:feats.append('b0_lemma=%s_' % (b0_lemma))
    if [b0_t] != 1*[None]:feats.append('b0_t=%s_' % (b0_t))
    if [b0_dl] != 1*[None]:feats.append('b0_dl=%s_' % (b0_dl))
    if [b0_len] != 1*[None]:feats.append('b0_len=%s_' % (b0_len))
    if [b0_reph] != 1*[None]:feats.append('b0_reph=%s_' % (b0_reph))
    if [b0_brown4] != 1*[None]:feats.append('b0_brown4=%s_' % (b0_brown4))
    if [b0_brown6] != 1*[None]:feats.append('b0_brown6=%s_' % (b0_brown6))
    if [b0_brown10] != 1*[None]:feats.append('b0_brown10=%s_' % (b0_brown10))
    if [b0_brown20] != 1*[None]:feats.append('b0_brown20=%s_' % (b0_brown20))
    if [a0_ne] != 1*[None]:feats.append('a0_ne=%s_' % (a0_ne))
    if [a0_w] != 1*[None]:feats.append('a0_w=%s_' % (a0_w))
    if [a0_lemma] != 1*[None]:feats.append('a0_lemma=%s_' % (a0_lemma))
    if [a0_t] != 1*[None]:feats.append('a0_t=%s_' % (a0_t))
    if [a0_dl] != 1*[None]:feats.append('a0_dl=%s_' % (a0_dl))
    if [a0_brown4] != 1*[None]:feats.append('a0_brown4=%s_' % (a0_brown4))
    if [a0_brown6] != 1*[None]:feats.append('a0_brown6=%s_' % (a0_brown6))
    if [a0_brown10] != 1*[None]:feats.append('a0_brown10=%s_' % (a0_brown10))
    if [a0_brown20] != 1*[None]:feats.append('a0_brown20=%s_' % (a0_brown20))
    if [s0_p1_ne] != 1*[None]:feats.append('s0_p1_ne=%s_' % (s0_p1_ne))
    if [s0_p1_w] != 1*[None]:feats.append('s0_p1_w=%s_' % (s0_p1_w))
    if [s0_p1_lemma] != 1*[None]:feats.append('s0_p1_lemma=%s_' % (s0_p1_lemma))
    if [s0_p1_t] != 1*[None]:feats.append('s0_p1_t=%s_' % (s0_p1_t))
    if [s0_p1_dl] != 1*[None]:feats.append('s0_p1_dl=%s_' % (s0_p1_dl))
    if [b0_pathpwd,b0_lemma,s0_lemma] != 3*[None]:feats.append('b0_pathpwd&b0_lemma&s0_lemma=%s_%s_%s_' % (b0_pathpwd,b0_lemma,s0_lemma))
    if [b0_apathpwd,a0_lemma,b0_lemma] != 3*[None]:feats.append('b0_apathpwd&a0_lemma&b0_lemma=%s_%s_%s_' % (b0_apathpwd,a0_lemma,b0_lemma))
    if [b0_pathpwd] != 1*[None]:feats.append('b0_pathpwd=%s_' % (b0_pathpwd))
    if [b0_apathpwd] != 1*[None]:feats.append('b0_apathpwd=%s_' % (b0_apathpwd))
    if [b0_lemma,b0_rsb_dl] != 2*[None]:feats.append('b0_lemma&b0_rsb_dl=%s_%s_' % (b0_lemma,b0_rsb_dl))
    if [b0_lemma,b0_nswp] != 2*[None]:feats.append('b0_lemma&b0_nswp=%s_%s_' % (b0_lemma,b0_nswp))
    if [dist1] != 1*[None]:feats.append('dist1=%s_' % (dist1))
    if [dist1,b0_pathp] != 2*[None]:feats.append('dist1&b0_pathp=%s_%s_' % (dist1,b0_pathp))
    if [dist2] != 1*[None]:feats.append('dist2=%s_' % (dist2))
    if [dist2,b0_apathp] != 2*[None]:feats.append('dist2&b0_apathp=%s_%s_' % (dist2,b0_apathp))
    if [s0_lemma,b0_t] != 2*[None]:feats.append('s0_lemma&b0_t=%s_%s_' % (s0_lemma,b0_t))
    if [s0_lemma,b0_dl] != 2*[None]:feats.append('s0_lemma&b0_dl=%s_%s_' % (s0_lemma,b0_dl))
    if [s0_t,b0_lemma] != 2*[None]:feats.append('s0_t&b0_lemma=%s_%s_' % (s0_t,b0_lemma))
    if [s0_dl,b0_lemma] != 2*[None]:feats.append('s0_dl&b0_lemma=%s_%s_' % (s0_dl,b0_lemma))
    if [s0_ne,b0_ne] != 2*[None]:feats.append('s0_ne&b0_ne=%s_%s_' % (s0_ne,b0_ne))
    if [s0_brown20,b0_brown4] != 2*[None]:feats.append('s0_brown20&b0_brown4=%s_%s_' % (s0_brown20,b0_brown4))
    if [s0_brown20,b0_dl] != 2*[None]:feats.append('s0_brown20&b0_dl=%s_%s_' % (s0_brown20,b0_dl))
    if [s0_brown4,b0_brown20] != 2*[None]:feats.append('s0_brown4&b0_brown20=%s_%s_' % (s0_brown4,b0_brown20))
    if [s0_dl,b0_brown20] != 2*[None]:feats.append('s0_dl&b0_brown20=%s_%s_' % (s0_dl,b0_brown20))
    if [a0_t,b0_lemma] != 2*[None]:feats.append('a0_t&b0_lemma=%s_%s_' % (a0_t,b0_lemma))
    if [a0_dl,b0_lemma] != 2*[None]:feats.append('a0_dl&b0_lemma=%s_%s_' % (a0_dl,b0_lemma))
    if [a0_ne,b0_ne] != 2*[None]:feats.append('a0_ne&b0_ne=%s_%s_' % (a0_ne,b0_ne))
    if [a0_cpt,b0_cpt] != 2*[None]:feats.append('a0_cpt&b0_cpt=%s_%s_' % (a0_cpt,b0_cpt))
    if [a0_cpt,b0_cpt,s0_lemma] != 3*[None]:feats.append('a0_cpt&b0_cpt&s0_lemma=%s_%s_%s_' % (a0_cpt,b0_cpt,s0_lemma))
    if [a0_cpt,b0_ne] != 2*[None]:feats.append('a0_cpt&b0_ne=%s_%s_' % (a0_cpt,b0_ne))
    return feats

0 comments on commit a3ffa05

Please sign in to comment.