diff --git a/fluid/rnn_beam_search/attention_seq2seq.py b/fluid/rnn_beam_search/attention_seq2seq.py
new file mode 100644
index 0000000000..c9d71bcec3
--- /dev/null
+++ b/fluid/rnn_beam_search/attention_seq2seq.py
@@ -0,0 +1,458 @@
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle.v2
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+from beam_search_api import *
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--embedding_dim",
+    type=int,
+    default=512,
+    help="The dimension of the embedding table. (default: %(default)d)")
+parser.add_argument(
+    "--encoder_size",
+    type=int,
+    default=512,
+    help="The size of the encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--decoder_size",
+    type=int,
+    default=512,
+    help="The size of the decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--batch_size",
+    type=int,
+    default=16,
+    help="The number of sequences in a mini-batch. (default: %(default)d)")
+parser.add_argument(
+    "--dict_size",
+    type=int,
+    default=30000,
+    help="The dictionary capacity. The source and target dictionaries "
+    "have the same capacity. (default: %(default)d)")
+parser.add_argument(
+    "--pass_num",
+    type=int,
+    default=2,
+    help="The number of passes to train. (default: %(default)d)")
+parser.add_argument(
+    "--learning_rate",
+    type=float,
+    default=0.0002,
+    help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+    "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+    "--beam_size",
+    type=int,
+    default=3,
+    help="The beam width used in beam search. (default: %(default)d)")
+parser.add_argument(
+    "--use_gpu",
+    type=distutils.util.strtobool,
+    default=False,
+    help="Whether to use the GPU. (default: %(default)d)")
+parser.add_argument(
+    "--max_length",
+    type=int,
+    default=250,
+    help="The maximum length of a generated sequence. "
+    "(default: %(default)d)")
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    def linear(inputs):
+        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+    cell_t = fluid.layers.sums(input=[
+        fluid.layers.elementwise_mul(
+            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+                x=input_gate, y=cell_tilde)
+    ])
+
+    hidden_t = fluid.layers.elementwise_mul(
+        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+    return hidden_t, cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+                   target_dict_dim, is_generating, beam_size, max_length):
+    """Construct a seq2seq network."""
+
+    def bi_lstm_encoder(input_seq, gate_size):
+        # The linear transformations for the input gate, output gate, forget
+        # gate and cell activation vectors need to be done outside of
+        # dynamic_lstm, so the projection size is 4 times gate_size.
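+        # dynamic_lstm consumes this precomputed input projection and applies
+        # the recurrent (hidden-to-hidden) transformation itself; the four
+        # gate blocks sit side by side in the projected input.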
+        input_forward_proj = fluid.layers.fc(input=input_seq,
+                                             size=gate_size * 4,
+                                             act=None,
+                                             bias_attr=False)
+        forward, _ = fluid.layers.dynamic_lstm(
+            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+        input_reversed_proj = fluid.layers.fc(input=input_seq,
+                                              size=gate_size * 4,
+                                              act=None,
+                                              bias_attr=False)
+        reversed, _ = fluid.layers.dynamic_lstm(
+            input=input_reversed_proj,
+            size=gate_size * 4,
+            is_reverse=True,
+            use_peepholes=False)
+        return forward, reversed
+
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32',
+        param_attr=fluid.ParamAttr(name='src_embedding'))
+
+    src_forward, src_reversed = bi_lstm_encoder(
+        input_seq=src_embedding, gate_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward, src_reversed], axis=1)
+
+    encoded_proj = fluid.layers.fc(input=encoded_vector,
+                                   size=decoder_size,
+                                   bias_attr=False)
+
+    backward_first = fluid.layers.sequence_pool(
+        input=src_reversed, pool_type='first')
+
+    decoder_boot = fluid.layers.fc(input=backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    cell_init = fluid.layers.fill_constant_batch_size_like(
+        input=decoder_boot,
+        value=0.0,
+        shape=[-1, decoder_size],
+        dtype='float32')
+    cell_init.stop_gradient = False
+
+    h = InitState(init=decoder_boot, need_reorder=True)
+    c = InitState(init=cell_init)
+
+    state_cell = StateCell(
+        cell_size=decoder_size,
+        inputs={'x': None,
+                'encoder_vec': None,
+                'encoder_proj': None},
+        states={'h': h,
+                'c': c})
+
+    def simple_attention(encoder_vec, encoder_proj, decoder_state):
+        decoder_state_proj = fluid.layers.fc(input=decoder_state,
+                                             size=decoder_size,
+                                             bias_attr=False)
+        decoder_state_expand = fluid.layers.sequence_expand(
+            x=decoder_state_proj, y=encoder_proj)
+        # the LoD of the concatenated tensor should be inherited from
+        # encoder_proj
+        concated = fluid.layers.concat(
+            input=[encoder_proj, decoder_state_expand], axis=1)
+        attention_weights = fluid.layers.fc(input=concated,
+                                            size=1,
+                                            act='tanh',
+                                            bias_attr=False)
+        attention_weights = fluid.layers.sequence_softmax(
+            input=attention_weights)
+        weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
+        scaled = fluid.layers.elementwise_mul(
+            x=encoder_vec, y=weights_reshape, axis=0)
+        context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+        return context
+
+    @state_cell.state_updater
+    def state_updater(state_cell):
+        current_word = state_cell.get_input('x')
+        encoder_vec = state_cell.get_input('encoder_vec')
+        encoder_proj = state_cell.get_input('encoder_proj')
+        prev_h = state_cell.get_state('h')
+        prev_c = state_cell.get_state('c')
+        context = simple_attention(encoder_vec, encoder_proj, prev_h)
+        decoder_inputs = fluid.layers.concat(
+            input=[context, current_word], axis=1)
+        h, c = lstm_step(decoder_inputs, prev_h, prev_c, decoder_size)
+        state_cell.set_state('h', h)
+        state_cell.set_state('c', c)
+
+    if not is_generating:
+        trg_word_idx = fluid.layers.data(
+            name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+        trg_embedding = fluid.layers.embedding(
+            input=trg_word_idx,
+            size=[target_dict_dim, embedding_dim],
+            dtype='float32',
+            param_attr=fluid.ParamAttr('trg_embedding'))
+
+        decoder = TrainingDecoder(state_cell)
+
+        with decoder.block():
+            current_word = decoder.step_input(trg_embedding)
+            encoder_vec = decoder.static_input(encoded_vector)
+            encoder_proj = decoder.static_input(encoded_proj)
+            decoder.state_cell.compute_state(inputs={
+                'x': current_word,
+                'encoder_vec': encoder_vec,
+                'encoder_proj': encoder_proj
+            })
+            h = decoder.state_cell.get_state('h')
+            decoder.state_cell.update_states()
+            out = fluid.layers.fc(input=h,
+                                  size=target_dict_dim,
+                                  bias_attr=True,
+                                  act='softmax')
+            decoder.output(out)
+
+        label = fluid.layers.data(
+            name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+        cost = fluid.layers.cross_entropy(input=decoder(), label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+        return avg_cost, feeding_list
+    else:
+        init_ids = fluid.layers.data(
+            name="init_ids", shape=[1], dtype="int64", lod_level=2)
+        init_scores = fluid.layers.data(
+            name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+        def embedding(input):
+            return fluid.layers.embedding(
+                input=input,
+                size=[target_dict_dim, embedding_dim],
+                dtype='float32',
+                param_attr=fluid.ParamAttr('trg_embedding'))
+
+        decoder = BeamSearchDecoder(state_cell, max_len=max_length)
+
+        with decoder.block():
+            encoder_vec = decoder.read_array(init=encoded_vector)
+            encoder_proj = decoder.read_array(init=encoded_proj)
+            prev_ids = decoder.read_array(init=init_ids, is_ids=True)
+            prev_scores = decoder.read_array(init=init_scores, is_scores=True)
+            prev_ids_embedding = embedding(prev_ids)
+            prev_h = decoder.state_cell.get_state('h')
+            prev_c = decoder.state_cell.get_state('c')
+            prev_h_expanded = fluid.layers.sequence_expand(prev_h, prev_scores)
+            prev_c_expanded = fluid.layers.sequence_expand(prev_c, prev_scores)
+            encoder_vec_expanded = fluid.layers.sequence_expand(encoder_vec,
+                                                                prev_scores)
+            encoder_proj_expanded = fluid.layers.sequence_expand(encoder_proj,
+                                                                 prev_scores)
+            decoder.state_cell.set_state('h', prev_h_expanded)
+            decoder.state_cell.set_state('c', prev_c_expanded)
+            decoder.state_cell.compute_state(inputs={
+                'x': prev_ids_embedding,
+                'encoder_vec': encoder_vec_expanded,
+                'encoder_proj': encoder_proj_expanded
+            })
+            current_state = decoder.state_cell.get_state('h')
+            current_state_with_lod = fluid.layers.lod_reset(
+                x=current_state, y=prev_scores)
+            scores = fluid.layers.fc(input=current_state_with_lod,
+                                     size=target_dict_dim,
+                                     act='softmax')
+            topk_scores, topk_indices = fluid.layers.topk(scores, k=beam_size)
+            selected_ids, selected_scores = fluid.layers.beam_search(
+                prev_ids,
+                topk_indices,
+                topk_scores,
+                beam_size,
+                end_id=1,
+                level=0)
+            decoder.state_cell.update_states()
+            decoder.update_array(prev_ids, selected_ids)
+            decoder.update_array(prev_scores, selected_scores)
+            decoder.update_array(encoder_vec, encoder_vec_expanded)
+            decoder.update_array(encoder_proj, encoder_proj_expanded)
+
+        translation_ids, translation_scores = decoder()
+
+        feeding_list = ["source_sequence", "init_ids", "init_scores"]
+
+        return translation_ids, translation_scores, feeding_list
+
+
+def to_lodtensor(data, place, dtype='int64'):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype(dtype)
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    lod_t = core.LoDTensor()
+    lod_t.set(flattened_data, place)
+    lod_t.set_lod([lod])
+    return lod_t, lod[-1]
+
+
+def lodtensor_to_ndarray(lod_tensor):
+    dims = lod_tensor.get_dims()
+    ndarray = np.zeros(shape=dims).astype('float32')
+    for i in xrange(np.product(dims)):
+        ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+    return ndarray
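+
+# Example of the LoD layout produced by to_lodtensor (illustrative values):
+#   data = [[1, 2, 3], [4, 5]]          # two sequences, of length 3 and 2
+#   tensor, word_num = to_lodtensor(data, place)
+#   => tensor holds the flattened column [[1], [2], [3], [4], [5]],
+#      tensor.lod() == [[0, 3, 5]] (cumulative offsets), word_num == 5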
+
+
+def train():
+    avg_cost, feeding_list = seq_to_seq_net(
+        args.embedding_dim,
+        args.encoder_size,
+        args.decoder_size,
+        args.dict_size,
+        args.dict_size,
+        False,
+        beam_size=args.beam_size,
+        max_length=args.max_length)
+
+    # clone from default main program
+    inference_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program(), print_log=False)
+
+    train_batch_generator = paddle.v2.batch(
+        paddle.v2.reader.shuffle(
+            paddle.v2.dataset.wmt14.train(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    test_batch_generator = paddle.v2.batch(
+        paddle.v2.reader.shuffle(
+            paddle.v2.dataset.wmt14.test(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    def do_validation():
+        total_loss = 0.0
+        count = 0
+        for batch_id, data in enumerate(test_batch_generator()):
+            src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
+            trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
+            lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
+
+            fetch_outs = exe.run(inference_program,
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost],
+                                 return_numpy=False)
+
+            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
+            count += 1
+
+        return total_loss / count
+
+    for pass_id in xrange(args.pass_num):
+        pass_start_time = time.time()
+        words_seen = 0
+        for batch_id, data in enumerate(train_batch_generator()):
+            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
+            words_seen += word_num
+            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
+            words_seen += word_num
+            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
+
+            fetch_outs = exe.run(framework.default_main_program(),
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost])
+
+            avg_cost_val = np.array(fetch_outs[0])
+            print('pass_id=%d, batch_id=%d, train_loss: %f' %
+                  (pass_id, batch_id, avg_cost_val))
+
+        pass_end_time = time.time()
+        test_loss = do_validation()
+        time_consumed = pass_end_time - pass_start_time
+        words_per_sec = words_seen / time_consumed
+        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+              (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+    translation_ids, translation_scores, feeding_list = seq_to_seq_net(
+        args.embedding_dim,
+        args.encoder_size,
+        args.decoder_size,
+        args.dict_size,
+        args.dict_size,
+        True,
+        beam_size=args.beam_size,
+        max_length=args.max_length)
+
+    fluid.memory_optimize(fluid.default_main_program(), print_log=False)
+
+    test_batch_generator = paddle.v2.batch(
+        paddle.v2.reader.shuffle(
+            paddle.v2.dataset.wmt14.test(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    for batch_id, data in enumerate(test_batch_generator()):
+        batch_size = len(data)
+        src_seq, _ = to_lodtensor(map(lambda x: x[0], data), place)
+        init_ids, _ = to_lodtensor([[0] for _ in xrange(batch_size)], place)
+        init_ids.set_lod(init_ids.lod() + [init_ids.lod()[-1]])
+        init_scores, _ = to_lodtensor([[1.0] for _ in xrange(batch_size)],
+                                      place, 'float32')
+        init_scores.set_lod(init_scores.lod() + [init_scores.lod()[-1]])
+
+        fetch_outs = exe.run(framework.default_main_program(),
+                             feed={
+                                 feeding_list[0]: src_seq,
+                                 feeding_list[1]: init_ids,
+                                 feeding_list[2]: init_scores
+                             },
+                             fetch_list=[translation_ids, translation_scores],
+                             return_numpy=False)
+
+        print(fetch_outs[0].lod())
+        break
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    if args.infer_only:
+        infer()
+    else:
+        train()
diff --git a/fluid/rnn_beam_search/beam_search_api.py b/fluid/rnn_beam_search/beam_search_api.py
new file mode 100644
index 0000000000..78489d365c
--- /dev/null
+++ b/fluid/rnn_beam_search/beam_search_api.py
@@ -0,0 +1,403 @@
+import contextlib
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from paddle.fluid.framework import Variable
+from paddle.fluid import framework, unique_name
+from paddle.fluid.layer_helper import LayerHelper
+import paddle.fluid.core as core
+
+
+class DecoderType:
+    TRAINING = 1
+    BEAM_SEARCH = 2
+
+
+class InitState(object):
+    def __init__(self,
+                 init=None,
+                 shape=None,
+                 value=0.0,
+                 need_reorder=False,
+                 dtype='float32'):
+        self._init = init
+        self._shape = shape
+        self._value = value
+        self._need_reorder = need_reorder
+        self._dtype = dtype
+
+    @property
+    def value(self):
+        return self._init  # may create a LoDTensor
+
+    @property
+    def need_reorder(self):
+        return self._need_reorder
+
+
+class MemoryState(object):
+    def __init__(self, state_name, rnn_obj, init_state):
+        self._state_name = state_name
+        self._rnn_obj = rnn_obj
+        # each state is backed by a rnn.memory
+        self._state_mem = self._rnn_obj.memory(
+            init=init_state.value, need_reorder=init_state.need_reorder)
+
+    def get_state(self):
+        return self._state_mem
+
+    def update_state(self, state):
+        self._rnn_obj.update_memory(self._state_mem, state)
+
+
+class ArrayState(object):
+    def __init__(self, state_name, block, init_state):
+        self._state_name = state_name
+        self._block = block
+
+        self._state_array = self._block.create_var(
+            name=unique_name.generate('array_state_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=init_state.value.dtype)
+
+        self._counter = self._block.create_var(
+            name=unique_name.generate('array_state_counter'),
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            dtype='int64')
+
+        # initialize counter
+        self._block.append_op(
+            type='fill_constant',
+            inputs={},
+            outputs={'Out': [self._counter]},
+            attrs={
+                'shape': [1],
+                'dtype': self._counter.dtype,
+                'value': float(0.0),
+                'force_cpu': True
+            })
+
+        self._counter.stop_gradient = True
+
+        # write initial state
+        block.append_op(
+            type='write_to_array',
+            inputs={'X': init_state.value,
+                    'I': self._counter},
+            outputs={'Out': self._state_array})
+
+    def get_state(self):
+        state = layers.array_read(array=self._state_array, i=self._counter)
+        return state
+
+    def update_state(self, state):
+        layers.increment(x=self._counter, value=1, in_place=True)
+        layers.array_write(state, array=self._state_array, i=self._counter)
+
+
+class StateCell(object):
+    def __init__(self, cell_size, inputs, states, name=None):
+        self._helper = LayerHelper('state_cell', name=name)
+        self._cur_states = {}
+        self._state_names = []
+        self._states_holder = {}
+        for state_name, state in states.items():
+            if not isinstance(state, InitState):
+                raise ValueError('state must be an InitState object.')
+            self._cur_states[state_name] = state
+            self._state_names.append(state_name)
+        self._inputs = inputs  # inputs are placeholders here
+        self._cur_decoder_obj = None
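+        # A single StateCell may serve both a TrainingDecoder and a
+        # BeamSearchDecoder: _states_holder maps each state name to
+        # {id(decoder): storage}, where the storage is a MemoryState inside a
+        # DynamicRNN or an ArrayState inside a While loop.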
+        self._in_decoder = False
+        self._switched_decoder = False
+        self._state_updater = None
+
+    def enter_decoder(self, decoder_obj):
+        if self._in_decoder or self._cur_decoder_obj is not None:
+            raise ValueError('StateCell has already entered a decoder.')
+        self._in_decoder = True
+        self._cur_decoder_obj = decoder_obj
+        self._switched_decoder = False
+
+    def _switch_decoder(self):  # lazy switch
+        if not self._in_decoder:
+            raise ValueError('StateCell must enter a decoder first.')
+
+        if self._switched_decoder:
+            raise ValueError('StateCell has already switched decoders.')
+
+        for state_name in self._state_names:
+            if state_name not in self._states_holder:
+                state = self._cur_states[state_name]
+
+                if not isinstance(state, InitState):
+                    raise ValueError('Current type of state is %s, should be '
+                                     'an InitState object.' % type(state))
+
+                self._states_holder[state_name] = {}
+
+                if self._cur_decoder_obj.type == DecoderType.TRAINING:
+                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
+                        = MemoryState(state_name,
+                                      self._cur_decoder_obj.dynamic_rnn,
+                                      state)
+                elif self._cur_decoder_obj.type == DecoderType.BEAM_SEARCH:
+                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
+                        = ArrayState(state_name,
+                                     self._cur_decoder_obj.parent_block(),
+                                     state)
+                else:
+                    raise ValueError('Unknown decoder type, only '
+                                     '[TRAINING, BEAM_SEARCH] are supported.')
+
+            # Read back, since the current state should be a LoDTensor
+            self._cur_states[state_name] = \
+                self._states_holder[state_name][id(self._cur_decoder_obj)].get_state()
+
+        self._switched_decoder = True
+
+    def get_state(self, state_name):
+        if self._in_decoder and not self._switched_decoder:
+            self._switch_decoder()
+
+        if state_name not in self._cur_states:
+            raise ValueError(
+                'Unknown state %s. Please make sure _switch_decoder() '
+                'has been invoked.' % state_name)
+
+        return self._cur_states[state_name]
+
+    def get_input(self, input_name):
+        if input_name not in self._inputs or self._inputs[input_name] is None:
+            raise ValueError('Invalid input %s.' % input_name)
+        return self._inputs[input_name]
+
+    def set_state(self, state_name, state_value):
+        self._cur_states[state_name] = state_value
+
+    def state_updater(self, updater):
+        self._state_updater = updater
+
+        def _decorator(state_cell):
+            if state_cell is not self:
+                raise TypeError('Updater should only accept its own '
+                                'StateCell object as argument.')
+            updater(state_cell)
+
+        return _decorator
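+
+    # Typical updater registration (see attention_seq2seq.py and
+    # simple_seq2seq.py; step() stands in for any layer composition):
+    #
+    #     @state_cell.state_updater
+    #     def updater(state_cell):
+    #         x = state_cell.get_input('x')
+    #         prev_h = state_cell.get_state('h')
+    #         state_cell.set_state('h', step(x, prev_h))
+    #
+    # compute_state() below feeds each step's inputs and invokes this hook.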
+
+    def compute_state(self, inputs):
+        if self._in_decoder and not self._switched_decoder:
+            self._switch_decoder()
+
+        for input_name, input_value in inputs.items():
+            if input_name not in self._inputs:
+                raise ValueError('Unknown input %s. '
+                                 'Please make sure %s is declared in the '
+                                 'input placeholders.' %
+                                 (input_name, input_name))
+            self._inputs[input_name] = input_value
+        self._state_updater(self)
+
+    def update_states(self):
+        if self._in_decoder and not self._switched_decoder:
+            self._switch_decoder()
+
+        for state_name, decoder_state in self._states_holder.items():
+            if id(self._cur_decoder_obj) not in decoder_state:
+                raise ValueError('Unknown decoder object, please make sure '
+                                 '_switch_decoder() has been invoked.')
+            decoder_state[id(self._cur_decoder_obj)].update_state(
+                self._cur_states[state_name])
+
+    def leave_decoder(self, decoder_obj):
+        if not self._in_decoder:
+            raise ValueError('StateCell is not in a decoder, '
+                             'invalid leaving operation.')
+
+        if self._cur_decoder_obj != decoder_obj:
+            raise ValueError('Inconsistent decoder object in StateCell.')
+
+        self._in_decoder = False
+        self._cur_decoder_obj = None
+        self._switched_decoder = False
+
+
+class TrainingDecoder(object):
+    BEFORE_DECODER = 0
+    IN_DECODER = 1
+    AFTER_DECODER = 2
+
+    def __init__(self, state_cell, name=None):
+        self._helper = LayerHelper('training_decoder', name=name)
+        self._status = TrainingDecoder.BEFORE_DECODER
+        self._dynamic_rnn = layers.DynamicRNN()
+        self._type = DecoderType.TRAINING
+        self._state_cell = state_cell
+        self._state_cell.enter_decoder(self)
+
+    @contextlib.contextmanager
+    def block(self):
+        if self._status != TrainingDecoder.BEFORE_DECODER:
+            raise ValueError('decoder.block() can only be invoked once')
+        self._status = TrainingDecoder.IN_DECODER
+        with self._dynamic_rnn.block():
+            yield
+        self._status = TrainingDecoder.AFTER_DECODER
+        self._state_cell.leave_decoder(self)
+
+    @property
+    def state_cell(self):
+        self._assert_in_decoder_block('state_cell')
+        return self._state_cell
+
+    @property
+    def dynamic_rnn(self):
+        return self._dynamic_rnn
+
+    @property
+    def type(self):
+        return self._type
+
+    def step_input(self, x):
+        self._assert_in_decoder_block('step_input')
+        return self._dynamic_rnn.step_input(x)
+
+    def static_input(self, x):
+        self._assert_in_decoder_block('static_input')
+        return self._dynamic_rnn.static_input(x)
+
+    def __call__(self, *args, **kwargs):
+        if self._status != TrainingDecoder.AFTER_DECODER:
+            raise ValueError('The output of a training decoder can only be '
+                             'accessed outside its block.')
+        return self._dynamic_rnn(*args, **kwargs)
+
+    def output(self, *outputs):
+        self._assert_in_decoder_block('output')
+        self._dynamic_rnn.output(*outputs)
+
+    def _assert_in_decoder_block(self, method):
+        if self._status != TrainingDecoder.IN_DECODER:
+            raise ValueError('%s should be invoked inside the block of a '
+                             'TrainingDecoder object.' % method)
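+
+# Minimal usage sketch of TrainingDecoder (comments only; trg_embedding and
+# target_dict_dim come from the caller's program, as in simple_seq2seq.py):
+#
+#     decoder = TrainingDecoder(state_cell)
+#     with decoder.block():
+#         word = decoder.step_input(trg_embedding)
+#         decoder.state_cell.compute_state(inputs={'x': word})
+#         out = layers.fc(input=decoder.state_cell.get_state('h'),
+#                         size=target_dict_dim, act='softmax')
+#         decoder.state_cell.update_states()
+#         decoder.output(out)
+#     predictions = decoder()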
+
+
+class BeamSearchDecoder(object):
+    BEFORE_BEAM_SEARCH_DECODER = 0
+    IN_BEAM_SEARCH_DECODER = 1
+    AFTER_BEAM_SEARCH_DECODER = 2
+
+    def __init__(self, state_cell, max_len, name=None):
+        self._helper = LayerHelper('beam_search_decoder', name=name)
+        self._counter = layers.zeros(shape=[1], dtype='int64')
+        self._counter.stop_gradient = True
+        self._type = DecoderType.BEAM_SEARCH
+        self._max_len = layers.fill_constant(
+            shape=[1], dtype='int64', value=max_len)
+        self._cond = layers.less_than(x=self._counter, y=self._max_len)
+        self._while_op = layers.While(self._cond)
+        self._state_cell = state_cell
+        self._state_cell.enter_decoder(self)
+        self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER
+        self._zero_idx = layers.fill_constant(
+            shape=[1], value=0, dtype='int64', force_cpu=True)
+        self._array_dict = {}
+        self._array_link = []
+        self._ids_array = None
+        self._scores_array = None
+
+    @contextlib.contextmanager
+    def block(self):
+        if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER:
+            raise ValueError('block() can only be invoked once.')
+
+        self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER
+
+        with self._while_op.block():
+            yield
+
+            layers.increment(x=self._counter, value=1.0, in_place=True)
+
+            for value, array in self._array_link:
+                layers.array_write(x=value, i=self._counter, array=array)
+
+            layers.less_than(x=self._counter, y=self._max_len, cond=self._cond)
+
+        self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER
+        self._state_cell.leave_decoder(self)
+
+    @property
+    def type(self):
+        return self._type
+
+    # init must be provided
+    def read_array(self, init, is_ids=False, is_scores=False):
+        self._assert_in_decoder_block('read_array')
+
+        if is_ids and is_scores:
+            raise ValueError('Shouldn\'t mark the current array as both the '
+                             'ids array and the scores array.')
+
+        if not isinstance(init, Variable):
+            raise TypeError('The input argument `init` must be a Variable.')
+
+        parent_block = self.parent_block()
+        array = parent_block.create_var(
+            name=unique_name.generate('beam_search_decoder_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=init.dtype)
+        parent_block.append_op(
+            type='write_to_array',
+            inputs={'X': init,
+                    'I': self._zero_idx},
+            outputs={'Out': array})
+
+        if is_ids:
+            self._ids_array = array
+        elif is_scores:
+            self._scores_array = array
+
+        read_value = layers.array_read(array=array, i=self._counter)
+        self._array_dict[read_value.name] = array
+        return read_value
+
+    def update_array(self, array, value):
+        self._assert_in_decoder_block('update_array')
+
+        if not isinstance(array, Variable):
+            raise TypeError('The input argument `array` must be a Variable.')
+        if not isinstance(value, Variable):
+            raise TypeError('The input argument `value` must be a Variable.')
+
+        array = self._array_dict.get(array.name, None)
+        if array is None:
+            raise ValueError('Please invoke read_array before update_array.')
+        self._array_link.append((value, array))
+
+    def __call__(self):
+        if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER:
+            raise ValueError('The output of a BeamSearchDecoder object can '
+                             'only be accessed outside its block.')
+        return layers.beam_search_decode(
+            ids=self._ids_array, scores=self._scores_array)
+
+    @property
+    def state_cell(self):
+        self._assert_in_decoder_block('state_cell')
+        return self._state_cell
+
+    def parent_block(self):
+        program = self._helper.main_program
+        parent_block_idx = program.current_block().parent_idx
+        if parent_block_idx < 0:
+            raise ValueError('Invalid block with index %d.' % parent_block_idx)
+        parent_block = program.block(parent_block_idx)
+        return parent_block
+
+    def _assert_in_decoder_block(self, method):
+        if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER:
+            raise ValueError('%s should be invoked inside the block of a '
+                             'BeamSearchDecoder object.' % method)
diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py
new file mode 100644
index 0000000000..c83922e609
--- /dev/null
+++ b/fluid/rnn_beam_search/simple_seq2seq.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as pd
+from paddle.fluid.executor import Executor
+from beam_search_api import *
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.v2.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 2
+max_length = 8
+topk_size = 50
+trg_dic_size = 10000
+beam_size = 2
+
+decoder_size = hidden_dim
+
+place = core.CPUPlace()
+
+
+def encoder():
+    # encoder
+    src_word_id = pd.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = pd.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
+
+def decoder_state_cell(context):
+    h = InitState(init=context)
+    state_cell = StateCell(
+        cell_size=decoder_size, inputs={'x': None}, states={'h': h})
+
+    @state_cell.state_updater
+    def updater(state_cell):
+        current_word = state_cell.get_input('x')
+        prev_h = state_cell.get_state('h')
+        # make sure the LoD of h is inherited from prev_h
+        h = pd.fc(input=[prev_h, current_word], size=decoder_size, act='tanh')
+        state_cell.set_state('h', h)
+
+    return state_cell
+
+
+def decoder_train(state_cell):
+    # decoder
+    trg_language_word = pd.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = pd.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    decoder = TrainingDecoder(state_cell)
+
+    with decoder.block():
+        current_word = decoder.step_input(trg_embedding)
+        decoder.state_cell.compute_state(inputs={'x': current_word})
+        current_score = pd.fc(input=decoder.state_cell.get_state('h'),
+                              size=target_dict_dim,
+                              act='softmax')
+        decoder.state_cell.update_states()
+        decoder.output(current_score)
+
+    return decoder()
+
+
+def decoder_decode(state_cell):
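+    # One decoding step per While iteration:
+    #   1. read back the ids/scores selected at the previous step,
+    #   2. expand the hidden state along the LoD of prev_scores so every
+    #      beam candidate gets its own copy,
+    #   3. score next-word candidates and keep the top-k,
+    #   4. write the selections back to the arrays for the next step.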
+    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = pd.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    def embedding(input):
+        return pd.embedding(
+            input=input,
+            size=[dict_size, word_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr=fluid.ParamAttr('vemb'))
+
+    decoder = BeamSearchDecoder(state_cell, max_len=max_length)
+
+    with decoder.block():
+        prev_ids = decoder.read_array(init=init_ids, is_ids=True)
+        prev_scores = decoder.read_array(init=init_scores, is_scores=True)
+        prev_ids_embedding = embedding(prev_ids)
+        prev_state = decoder.state_cell.get_state('h')
+        prev_state_expanded = pd.sequence_expand(prev_state, prev_scores)
+        decoder.state_cell.set_state('h', prev_state_expanded)
+        decoder.state_cell.compute_state(inputs={'x': prev_ids_embedding})
+        current_state = decoder.state_cell.get_state('h')
+        # copy the LoD from prev_scores to current_state
+        current_state_with_lod = pd.lod_reset(x=current_state, y=prev_scores)
+        scores = pd.fc(input=current_state_with_lod,
+                       size=target_dict_dim,
+                       act='softmax')
+        topk_scores, topk_indices = pd.topk(scores, k=topk_size)
+        selected_ids, selected_scores = pd.beam_search(
+            prev_ids, topk_indices, topk_scores, beam_size, end_id=1, level=0)
+        decoder.state_cell.update_states()
+        decoder.update_array(prev_ids, selected_ids)
+        decoder.update_array(prev_scores, selected_scores)
+
+    translation_ids, translation_scores = decoder()
+
+    return translation_ids, translation_scores
+
+
+def set_init_lod(data, lod, place):
+    res = core.LoDTensor()
+    res.set(data, place)
+    res.set_lod(lod)
+    return res
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def train_main():
+    context = encoder()
+    state_cell = decoder_state_cell(context)
+    rnn_out = decoder_train(state_cell)
+    label = pd.data(
+        name="target_language_next_word", shape=[1], dtype='int64',
+        lod_level=1)
+    cost = pd.cross_entropy(input=rnn_out, label=label)
+    avg_cost = pd.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    train_data = paddle.v2.batch(
+        paddle.v2.reader.shuffle(
+            paddle.v2.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    batch_id = 0
+    for pass_id in xrange(1):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+            outs = exe.run(framework.default_main_program(),
+                           feed={
+                               'src_word_id': word_data,
+                               'target_language_word': trg_word,
+                               'target_language_next_word': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 3000:
+                break
+            batch_id += 1
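+
+
+# The decoder below is seeded with one start token (id 0) and score 1.0 per
+# example. Both init LoD levels are [0, 1, ..., batch_size]: at the first step
+# every source sentence has exactly one beam branch holding one candidate,
+# e.g. for batch_size=2 the LoD is [[0, 1, 2], [0, 1, 2]].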
+def decode_main():
+    context = encoder()
+    state_cell = decoder_state_cell(context)
+    translation_ids, translation_scores = decoder_decode(state_cell)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_lod = [i for i in range(batch_size)] + [batch_size]
+    init_lod = [init_lod, init_lod]
+
+    train_data = paddle.v2.batch(
+        paddle.v2.reader.shuffle(
+            paddle.v2.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+    for _, data in enumerate(train_data()):
+        init_ids = set_init_lod(init_ids_data, init_lod, place)
+        init_scores = set_init_lod(init_scores_data, init_lod, place)
+
+        src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
+
+        result_ids, result_scores = exe.run(
+            framework.default_main_program(),
+            feed={
+                'src_word_id': src_word_data,
+                'init_ids': init_ids,
+                'init_scores': init_scores
+            },
+            fetch_list=[translation_ids, translation_scores],
+            return_numpy=False)
+        print(result_ids.lod())
+        #break
+
+
+if __name__ == '__main__':
+    #train_main()
+    decode_main()
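+
+# Usage notes: running this script decodes with beam search; to train first,
+# uncomment train_main() above. The attention-based model is driven by
+# attention_seq2seq.py, e.g. (flags from its argparse setup):
+#     python attention_seq2seq.py --pass_num 2
+#     python attention_seq2seq.py --infer_only --beam_size 3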