This repository has been archived by the owner on Oct 9, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
decode
executable file
·117 lines (103 loc) · 5.71 KB
/
decode
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
import optparse
import sys
import models
from collections import namedtuple
# Command-line options: data-file paths plus the two search knobs
# (k = TM candidates kept per source phrase, s = stack/beam width).
# NOTE: this script is Python 2 (sys.maxint here; print statement and
# dict.itervalues() later in the file).
optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/input", help="File containing sentences to translate (default=data/input)")
optparser.add_option("-t", "--translation-model", dest="tm", default="data/tm", help="File containing translation model (default=data/tm)")
optparser.add_option("-l", "--language-model", dest="lm", default="data/lm", help="File containing ARPA-format language model (default=data/lm)")
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=sys.maxint, type="int", help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=1, type="int", help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=1, type="int", help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]  # positional arguments are ignored
tm = models.TM(opts.tm, opts.k)   # translation model: source-phrase tuple -> candidate phrases
lm = models.LM(opts.lm)           # ARPA-format n-gram language model
# Each sentence becomes a tuple of tokens; tuples are hashable, so the
# slices f[i:j] taken below can be used directly as keys into tm.
french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]
# tm should translate unknown tokens by copying them to the output with
# probability 1. This is a sensible strategy when translating between
# languages in Latin script since unknown tokens are often names or numbers.
for word in set(sum(french,())):
  if (word,) not in tm:
    # log-probability 0.0 == probability 1 for the identity translation
    tm[(word,)] = [models.phrase(word, 0.0)]
########################################################################
# To alter the underlying dynamic program of the decoder, you need only
# modify the state data structure and the three functions below:
# * initial_state()
# * assign_stack(h)
# * extend_state(h,f)
########################################################################
# this data structure is the fundamental object of a dynamic program for
# monotone phrase-based decoding (also known as a semi-Markov model).
# Field i stores the number of words that have been translated (which are
# always the words from 1 to i). Field lm_state stores the language model
# conditioning context that should be used to compute the probability of
# the next English word in the translation.
state = namedtuple("state", "i, lm_state")  # hashable, so it can serve as a stack key for recombination
# generate an initial hypothesis
def initial_state():
  """Return the empty hypothesis state: zero source words covered and
  the language model's sentence-start conditioning context."""
  words_covered = 0
  return state(words_covered, lm.begin())
# determine what stack a hypothesis should be placed in
def assign_stack(s):
  """Return the index of the stack this hypothesis state belongs in:
  the count of source words it has translated (its i field)."""
  covered = s.i
  return covered
# Given an input consisting of partial translation state s and
# associated source sentence f, this function should return a list of
# all possible extensions to it. Each extension must be a tuple
# of the form (new_s, logprob, phrase), in which new_s is a new state
# object, and the edge from s to new_s should be labeled by phrase
# with weight logprob.
def extend_state(s, f):
  """Yield every monotone extension of partial-translation state s over
  source sentence f.

  Each yielded item is a tuple (new_s, logprob, phrase): new_s is the
  state after translating source words up to j, logprob is the edge
  weight (the phrase's TM score, plus the LM score of each English word
  it produces, plus the end-of-sentence LM cost when j covers the whole
  sentence), and phrase labels the edge.
  """
  for j in xrange(s.i + 1, len(f) + 1):
    # Fix: the original tested `f[s.i:j] in tm` and then indexed
    # `tm[f[s.i:j]]`, slicing the tuple and probing the dict twice;
    # a single .get() does one slice and one lookup.
    phrases = tm.get(f[s.i:j])
    if phrases is None:
      continue
    for phrase in phrases:
      # edge weight includes p_TM
      logprob = phrase.logprob
      # add p_LM probabilities for every word in phrase.english
      lm_state = s.lm_state
      for word in phrase.english.split():
        (lm_state, word_logprob) = lm.score(lm_state, word)
        logprob += word_logprob
      # special case: end of sentence
      logprob += lm.end(lm_state) if j == len(f) else 0.0
      # finally, return the new hypothesis
      new_s = state(j, lm_state)
      yield (new_s, logprob, phrase)
########################################################################
# End of functions requiring modification
########################################################################
# The following code implements a generic stack decoding algorithm
# that is agnostic to the form of a partial translation state.
# It does however assume that all states in stacks[i] represent
# translations of exactly i source words (though they can be any words).
# It shouldn't be necessary to modify this code if you are only
# changing the dynamic program, but you should understand how it works.
sys.stderr.write("Decoding %s...\n" % (opts.input,))
for f in french:
  # a hypothesis is a node in the decoding search graph. It is parameterized
  # by a state object, defined above.
  hypothesis = namedtuple("hypothesis", "logprob, predecessor, phrase, state")
  # create stacks and add initial state
  stacks = [{} for _ in f] + [{}] # add stack for case of no words are covered
  stacks[0][initial_state()] = hypothesis(0.0, None, None, initial_state())
  # Stacks are processed in order. extend_state only yields states with a
  # larger i, and assign_stack places a state in stack i, so every
  # extension lands in a later stack and each hypothesis is expanded once.
  for stack in stacks[:-1]:
    # Histogram pruning: expand only the opts.s highest-scoring
    # hypotheses in this stack.
    for h in sorted(stack.itervalues(),key=lambda h: -h.logprob)[:opts.s]: # prune
      for (new_state, logprob, phrase) in extend_state(h.state, f):
        new_h = hypothesis(logprob=h.logprob + logprob,
                           predecessor=h,
                           phrase=phrase,
                           state=new_state)
        j = assign_stack(new_state)
        if new_state not in stacks[j] or stacks[j][new_state].logprob < new_h.logprob: # second case is recombination
          stacks[j][new_state] = new_h
  # Best complete translation: highest-scoring entry in the final stack
  # (states whose stack index equals the source length).
  winner = max(stacks[-1].itervalues(), key=lambda h: h.logprob)
  # Follow back-pointers from the winner to rebuild the English output
  # left-to-right (each phrase is followed by a trailing space).
  def extract_english(h):
    return "" if h.predecessor is None else "%s%s " % (extract_english(h.predecessor), h.phrase.english)
  print extract_english(winner)
  # optionally report (Viterbi) log probability of best hypothesis
  if opts.verbose:
    # Sum the TM component along the back-pointer chain; the LM share is
    # then winner.logprob minus that sum.
    def extract_tm_logprob(h):
      return 0.0 if h.predecessor is None else h.phrase.logprob + extract_tm_logprob(h.predecessor)
    tm_logprob = extract_tm_logprob(winner)
    sys.stderr.write("LM = %f, TM = %f, Total = %f\n" %
                     (winner.logprob - tm_logprob, tm_logprob, winner.logprob))