From 6710927f00f8ee54a0dd882fe0c2478179f431d7 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 14 Dec 2018 09:47:12 -0500 Subject: [PATCH 1/8] removed merging, dummy steps in favor of file prefixes --- confs/data.tconf | 54 +++++++++++--------------------------- confs/pipeline.tconf | 27 +++++-------------- sockeye.tape | 6 ----- tapes/bleu.tape | 8 +++--- tapes/download.tape | 41 ++++++++++++++++++++++++++--- tapes/dummy.tape | 30 --------------------- tapes/merge.tape | 17 ------------ tapes/postprocessing.tape | 4 +-- tapes/prepare_devtest.tape | 12 ++++----- tapes/sockeye.tape | 10 +++---- tapes/subword.tape | 24 ++++++++--------- tapes/tokenize.tape | 4 +-- tapes/truecase.tape | 11 +++++--- 13 files changed, 99 insertions(+), 149 deletions(-) delete mode 100644 tapes/dummy.tape delete mode 100644 tapes/merge.tape diff --git a/confs/data.tconf b/confs/data.tconf index 6e9b5dc..1f79577 100644 --- a/confs/data.tconf +++ b/confs/data.tconf @@ -1,43 +1,21 @@ global { - SRC=(TrainDataSource: - iwslt_deen_2014="de" - ) - TRG=(TrainDataSource: - iwslt_deen_2014="en" - ) - trg_lang=en # FIXME (only used by wrap_xml, under some rare cases) + SRC=de + TRG=en - train_data=(TrainDataSource: - iwslt_deen_2014=(side: - src="/path/to/iwslt/train.tags.nourl.de-en.de" - trg="/path/to/iwslt/train.tags.nourl.de-en.en" - ) - ) + # IWSLT + train_prefix="/home/hltcoe/mpost/code/tape4nmt/iwslt/train.tags.nourl.de-en" + dev_prefix="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.dev2010.de-en" + test_prefix=(TestSet: + iwslt10="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2010.de-en" + iwslt11="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2011.de-en" + iwslt12="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2012.de-en") - dev_data=(DevDataSource: - iwslt_deen_dev2010=(side: - src="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.de.xml" - trg="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.en.xml" - ) - iwslt_deen_dev2012=(side: -
src="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.de.xml" - trg="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.en.xml" - ) - ) - - test_data=(TestDataSource: - iwslt_deen_test2010=(side: - src="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.de.xml" - trg="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.en.xml" - ) - iwslt_deen_test2011=(side: - src="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.de.xml" - trg="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.en.xml" - ) - iwslt_deen_test2012=(side: - src="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.de.xml" - trg="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.en.xml" - ) - ) + # WMT18 + # These are the file prefixes, to which $SRC and $TRG are appended. + # You can list any number of prefixes, which will be concatenated. + # You can also use SacreBLEU to generate data (it will call `--echo src|ref` depending on the side). + # train_prefix="/export/common/data/corpora/bitext/de-en/train/commoncrawl.de-en /export/common/data/corpora/bitext/de-en/train/europarl-v7.de-en /export/common/data/corpora/bitext/de-en/train/news-commentary-v13.de-en /export/common/data/corpora/bitext/raw/wmt17/rapid2016.de-en" + # dev_prefix="/home/hltcoe/mpost/data/bitext/de-en/test/newstest2016.de-en /home/hltcoe/mpost/data/bitext/de-en/test/newstest2017.de-en" + # test_prefix="sacrebleu://wmt18 en-de" } diff --git a/confs/pipeline.tconf b/confs/pipeline.tconf index 91b415e..0b61cab 100644 --- a/confs/pipeline.tconf +++ b/confs/pipeline.tconf @@ -1,11 +1,11 @@ global { dev_text=(SgmDev: - no=$out@download_or_link[DevtestDataSection:dev] + no=$out@download_or_link[DataSection:dev] yes=(side: src=$src_out@dev_text_from_sgm trg=$trg_out@dev_text_from_sgm) ) test_text=(SgmTest: - no=$out@download_or_link[DevtestDataSection:test] + no=$out@download_or_link[DataSection:test] yes=(side: src=$src_out@test_text_from_sgm trg=$trg_out@test_text_from_sgm) ) @@ -18,27 +18,14 @@ global { no=$out@train_truecaser[side:trg] ) - merged_data=(MergeTest: - no=(DataSection: - 
train=$out@merge[DataSection:train] - devtest=(DevtestDataSection: - dev=$out@merge[DevtestDataSection:dev] - test=$test_text - ) - ) - yes=$out@merge - ) + raw_data=$out@download_or_link + raw_data_test_src=$out@download_or_link[DataSection:test,side:src] + raw_data_test_trg=$out@download_or_link[DataSection:test,side:trg] tokenized_data=(DoTokenize: yes=$out@tokenize - no=(DataSection: - train=$out@dummy_aggregate_merge[DataSection:train] - devtest=(DevtestDataSection: - dev=$out@dummy_aggregate_merge[DevtestDataSection:dev] - test=$test_text - ) - ) - ) + no=$out@download_or_link + ) # don't do truecase when doing characterize # truecase will mess up the word boundary annotation diff --git a/sockeye.tape b/sockeye.tape index eb6e245..1b2b0e6 100644 --- a/sockeye.tape +++ b/sockeye.tape @@ -1,7 +1,6 @@ import "tapes/packages.tape" import "tapes/submitters.tape" import "tapes/versioners.tape" -import "tapes/dummy.tape" # ==== pipeline starts here ==== @@ -17,11 +16,6 @@ import "tapes/prepare_train.tape" # - extract dev/test from sgm format, if the wrapping exists import "tapes/prepare_devtest.tape" -# merge multiple train/dev/test sets -# note that merging of train/dev is mandatory, -# while test is controlled by the branch point `MergeTest` -import "tapes/merge.tape" - # tasks related to tokenize import "tapes/tokenize.tape" diff --git a/tapes/bleu.tape b/tapes/bleu.tape index ef6bc04..18ffae3 100644 --- a/tapes/bleu.tape +++ b/tapes/bleu.tape @@ -2,8 +2,8 @@ # as that will involve creating a wrap template for merged xml task nist_bleu : mosesdecoder < in=$out@wrap_xml - < wrap_template=$out@download_or_link[DevtestDataSection:test,side:src] - < ref=$out@dummy_aggregate_merge[DevtestDataSection:test,side:trg] + < wrap_template=$out@download_or_link[DataSection:test,side:src] + < ref=$tokenized_data[DataSection:test,side:trg] > bleu > bleu_c :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { @@ -14,7 +14,7 @@ task nist_bleu : 
mosesdecoder task multi_bleu : mosesdecoder < in=$detokenized_output - < ref=$out@dummy_aggregate_merge[DataSection:devtest,DevtestDataSection:test,side:trg] + < ref=$tokenized_data[DataSection:test,side:trg] > bleu > bleu_c :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { @@ -34,7 +34,7 @@ task multi_bleu : mosesdecoder task sacrebleu : sacrebleu < in=$detokenized_output - < ref=$out@dummy_aggregate_merge[DataSection:devtest,DevtestDataSection:test,side:trg] + < ref=$raw_data_test_trg > bleu > signature :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags diff --git a/tapes/download.tape b/tapes/download.tape index 1b91c92..dca3973 100644 --- a/tapes/download.tape +++ b/tapes/download.tape @@ -1,6 +1,41 @@ task download_or_link : tools - < in=(DataSection: train=$train_data devtest=(DevtestDataSection: dev=$dev_data test=$test_data)) + :: in=(DataSection: train=$train_prefix dev=$dev_prefix test=$test_prefix) + :: side=(side: src trg) + :: SRC=@ + :: TRG=@ + :: section=(DataSection: train dev test) + :: testing=(TestMode: no yes) > out - :: pyenv=@ { - python $tools/download_or_link.py $in $out + :: pyenv=@ { + + if [[ $side == "src" ]]; then + side="src" + lang=$SRC + elif [[ $side == "trg" ]]; then + side="ref" + lang=$TRG + fi + + if [[ $in == sacrebleu://* ]]; then + # format: "sacrebleu://test-set langpair" + + # crazy bash notation to remove prefix + args=(${in##*://}) + sacrebleu -t ${args[0]} -l ${args[1]} --echo $side > $out + else + for infile in $in; do + infile="$infile.$lang" + + if [[ $testing == "yes" ]]; then + if [[ $section == "train" ]]; then + numlines=10000 + else + numlines=100 + fi + head -n $numlines $infile >> $out + else + zcat -f $infile >> $out + fi + done + fi } diff --git a/tapes/dummy.tape b/tapes/dummy.tape deleted file mode 100644 index 6fb8e6b..0000000 --- a/tapes/dummy.tape +++ /dev/null @@ -1,30 +0,0 @@ -func dummy - < in - > out { - - ln -s $in $out -} - 
-task dummy_aggregate_dev_text calls dummy - < in=$dev_text - > out - -task dummy_aggregate_test_text calls dummy - < in=$test_text - > out - -task dummy_aggregate_merge calls dummy - < in=$merged_data - > out - -task dummy_aggregate_tokenize calls dummy - < in=$tokenized_data - > out - -task dummy_aggregate_truecase calls dummy - < in=$truecased_data - > out - -task dummy_aggregate_subword calls dummy - < in=$prepared_data - > out diff --git a/tapes/merge.tape b/tapes/merge.tape deleted file mode 100644 index 520915c..0000000 --- a/tapes/merge.tape +++ /dev/null @@ -1,17 +0,0 @@ -task merge - < in=(DataSection: - train=(side: - src=$src_out@train_sample[TrainDataSource:*] - trg=$trg_out@train_sample[TrainDataSource:*] - ) - devtest=(DevtestDataSection: - dev=$out@dummy_aggregate_dev_text[DevDataSource:*] - test=$out@dummy_aggregate_test_text[TestDataSource:*] - ) - ) - > out { - - for file in $in ; do - cat $file >> $out - done -} diff --git a/tapes/postprocessing.tape b/tapes/postprocessing.tape index 8c8a198..a147a6f 100644 --- a/tapes/postprocessing.tape +++ b/tapes/postprocessing.tape @@ -49,9 +49,9 @@ task decharacterize : tools # do not support merge for the moment task wrap_xml : mosesdecoder < in=$detokenized_output - < wrap_template=$out@download_or_link[DevtestDataSection:test,side:src] + < wrap_template=$out@download_or_link[DataSection:test,side:src] > out - :: trg_lang=@ + :: trg_lang=$TRG :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { $mosesdecoder/scripts/ems/support/wrap-xml.perl $trg_lang $wrap_template < $in > $out diff --git a/tapes/prepare_devtest.tape b/tapes/prepare_devtest.tape index abe7532..5b42bdf 100644 --- a/tapes/prepare_devtest.tape +++ b/tapes/prepare_devtest.tape @@ -1,7 +1,7 @@ task dev_text_from_sgm : mosesdecoder - < src_in=$out@download_or_link[DataSection:devtest,DevtestDataSection:dev,side:src] - < trg_in=$out@download_or_link[DataSection:devtest,DevtestDataSection:dev,side:trg] - < 
wrap_template=$out@download_or_link[DataSection:devtest,DevtestDataSection:dev,side:src] + < src_in=$out@download_or_link[DataSection:dev,side:src] + < trg_in=$out@download_or_link[DataSection:dev,side:trg] + < wrap_template=$out@download_or_link[DataSection:dev,side:src] > src_out > trg_out { @@ -19,9 +19,9 @@ task dev_text_from_sgm : mosesdecoder } task test_text_from_sgm : mosesdecoder - < src_in=$out@download_or_link[DataSection:devtest,DevtestDataSection:test,side:src] - < trg_in=$out@download_or_link[DataSection:devtest,DevtestDataSection:test,side:trg] - < wrap_template=$out@download_or_link[DataSection:devtest,DevtestDataSection:test,side:src] + < src_in=$out@download_or_link[DataSection:test,side:src] + < trg_in=$out@download_or_link[DataSection:test,side:trg] + < wrap_template=$out@download_or_link[DataSection:test,side:src] > src_out > trg_out { diff --git a/tapes/sockeye.tape b/tapes/sockeye.tape index 2ab9a35..b2060be 100644 --- a/tapes/sockeye.tape +++ b/tapes/sockeye.tape @@ -1,6 +1,6 @@ task prepare_data : sockeye - < train_src_in=$out@dummy_aggregate_subword[DataSection:train,side:src] - < train_trg_in=$out@dummy_aggregate_subword[DataSection:train,side:trg] + < train_src_in=$prepared_data[DataSection:train,side:src] + < train_trg_in=$prepared_data[DataSection:train,side:trg] > data :: pyenv=@ :: train_max_sent_length=$MaxLen @@ -23,8 +23,8 @@ task prepare_data : sockeye task train : sockeye < prepared_data=$data@prepare_data - < dev_src=$out@dummy_aggregate_subword[DataSection:devtest,DevtestDataSection:dev,side:src] - < dev_trg=$out@dummy_aggregate_subword[DataSection:devtest,DevtestDataSection:dev,side:trg] + < dev_src=$prepared_data[DataSection:dev,side:src] + < dev_trg=$prepared_data[DataSection:dev,side:trg] > model :: pyenv=@ :: train_batch_type=@ @@ -92,7 +92,7 @@ task train : sockeye # the target input here is used to compute naïve acc and ppl, # that's why we need post-bpe target input task decode : sockeye - < 
in=$out@dummy_aggregate_subword[DataSection:devtest,DevtestDataSection:test,side:src] + < in=$prepared_data[DataSection:test,side:src] < model=$model@train > out="out" > log="out.log" diff --git a/tapes/subword.tape b/tapes/subword.tape index f7453c8..a03ed9c 100644 --- a/tapes/subword.tape +++ b/tapes/subword.tape @@ -1,19 +1,19 @@ task train_bpe : subword_nmt - < src_in=$out@dummy_aggregate_truecase[DataSection:train,side:src] # FIXME - < trg_in=$out@dummy_aggregate_truecase[DataSection:train,side:trg] # FIXME - > model="bpe.model" - :: bpe_operations=@ - :: SRC=@ - :: TRG=@ - :: pyenv=@ - :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { + < src_in=$tokenized_data[DataSection:train,side:src] + < trg_in=$tokenized_data[DataSection:train,side:trg] + > model="bpe.model" + :: bpe_operations=@ + :: SRC=@ + :: TRG=@ + :: pyenv=@ + :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { subword-nmt learn-joint-bpe-and-vocab -i $src_in $trg_in -s $bpe_operations -o $model --write-vocabulary bpe.vocab.$SRC bpe.vocab.$TRG -v 2> log } task apply_bpe : subword_nmt # < in=$truecased_data - < in=$out@dummy_aggregate_truecase + < in=$tokenized_data < model=$model@train_bpe > out :: pyenv=@ { @@ -22,8 +22,8 @@ task apply_bpe : subword_nmt } task train_sentencepiece : sentencepiece - < src_in=$out@dummy_aggregate_merge[DataSection:train,side:src] - < trg_in=$out@dummy_aggregate_merge[DataSection:train,side:trg] + < src_in=$tokenized_data[DataSection:train,side:src] + < trg_in=$tokenized_data[DataSection:train,side:trg] > model="sp.model" > vocab="sp.vocab" :: sentencepiece_vocab_size=@ @@ -34,7 +34,7 @@ task train_sentencepiece : sentencepiece } task apply_sentencepiece : sentencepiece - < in=$out@dummy_aggregate_merge + < in=$tokenized_data < model=$model@train_sentencepiece > out :: pyenv=@ { diff --git a/tapes/tokenize.tape b/tapes/tokenize.tape index 1627a25..2f747dc 100644 --- a/tapes/tokenize.tape +++ 
b/tapes/tokenize.tape @@ -16,13 +16,13 @@ func tokenize : mosesdecoder tools # stanford_seg } task tokenize calls tokenize : mosesdecoder tools # stanford_seg - < in=$out@dummy_aggregate_merge + < in=$raw_data > out :: Lang=(side: src=$SRC trg=$TRG) :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags task characterize : tools - < in=$out@tokenize # FIXME fix test + < in=$out@tokenize > out :: pyenv=@ :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { diff --git a/tapes/truecase.tape b/tapes/truecase.tape index 98fb120..0d87a65 100644 --- a/tapes/truecase.tape +++ b/tapes/truecase.tape @@ -1,19 +1,22 @@ task train_truecaser : mosesdecoder - < in=$out@dummy_aggregate_tokenize[DataSection:train] # FIXME + < src_in=$prepared_data[DataSection:train,side:src] + < trg_in=$prepared_data[DataSection:train,side:trg] > out :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { + tmpfile=$(mktemp) + - for in_file in $in ; do - cat $in_file >> $PWD/tmp + for in_file in $src_in $trg_in ; do + cat $in_file >> $tmpfile done - $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus $PWD/tmp -model $out + $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus $tmpfile -model $out - rm $PWD/tmp + rm -f $tmpfile } task truecase : mosesdecoder - < in=$out@dummy_aggregate_tokenize + < in=$tokenized_data < model=(side: src=$src_truecaser trg=$trg_truecaser) > out :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { From 9f976a3d2c15a3b34970eec4f15d2ebd0b600cd8 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 14 Dec 2018 10:06:21 -0500 Subject: [PATCH 2/8] run script now takes tconf --- run | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/run b/run index 3af96e1..a3b6186 100755 --- a/run +++ b/run @@ -2,6 +2,8 @@ set -u +TAPEDIR=$(dirname $0) + TOOLKIT=${1:-} if [[ -z $TOOLKIT ]]; then @@ -12,11 +14,18 @@ if [[ -z $TOOLKIT ]]; then fi shift -TCONF=$TOOLKIT.tconf
+TAPEFILE=$TAPEDIR/$TOOLKIT.tape +if [[ ! -e $TAPEFILE ]]; then + echo "Fatal: Couldn't find toolkit tape file $TAPEFILE" + exit 1 +fi + +TCONF=${1:-} if [[ ! -e $TCONF ]]; then - echo "Fatal: Couldn't find $TOOLKIT.tconf" + echo "Fatal: Couldn't find $TCONF" exit 1 fi +shift DUCTTAPE=$(which ducttape) if [[ $? -ne 0 ]]; then @@ -24,4 +33,4 @@ if [[ $? -ne 0 ]]; then exit 1 fi -${DUCTTAPE} ${TOOLKIT}.tape -C ${TCONF} $@ +"${DUCTTAPE}" "${TAPEFILE}" -C "${TCONF}" "$@" From 45accd26345ebae63e74a0255e7a840b2cb82df2 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 14 Dec 2018 10:07:42 -0500 Subject: [PATCH 3/8] minor change, moved submitter --- tapes/prepare_train.tape | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tapes/prepare_train.tape b/tapes/prepare_train.tape index 1e09c26..18bc1ee 100644 --- a/tapes/prepare_train.tape +++ b/tapes/prepare_train.tape @@ -3,11 +3,11 @@ task train_clean : mosesdecoder < trg_in=$out@download_or_link[DataSection:train,side:trg] > src_out > trg_out - :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags :: SRC=@ :: TRG=@ :: Ratio=@ - :: MaxLen=@ { + :: MaxLen=@ + :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags { mkdir -p tmp filename=$(basename $src_in) From 69eeb9e9e095447ca2e1e101155f6856270fa848 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 14 Dec 2018 10:08:04 -0500 Subject: [PATCH 4/8] removed unused variables --- sockeye.tconf | 60 +++++++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/sockeye.tconf b/sockeye.tconf index 6173fb2..ff176d3 100644 --- a/sockeye.tconf +++ b/sockeye.tconf @@ -9,42 +9,12 @@ global { # All ducttape files will be written underneath this directory ducttape_output="out" - num_layers=(TestMode: no="6:6" yes="1:1") + num_layers=(TestMode: no=(NumLayers: 6_6="6:6" 10_2="10:2") yes="1:1") model_size=512 embed_size="512:512" - # all default is consistent with
nematus - train_train_from="" # if there is a previous model to start with - train_train_from_state_dict="" # if there is a previous dict to start with - train_start_epoch="" # if trained for certain amount of epochs previously - train_batch_type=(TestMode: no="word" yes="sentence") - train_batch_size=(TestMode: no="80" yes=8) - train_optim="adam" - train_dropout=(Dropout: 0.1 0.3 0.5) - train_lr="0.001" - - # train_lr_min="1e-8" - train_lr_min="" - train_lr_shrink="0.5" - - # train_lr_scheduler="inverse_sqrt" - # train_warmup_init_lr="1e-07" - # train_warmup_updates="4000" - # train_criterion="label_smoothed_cross_entropy" - # train_label_smoothing="0.1" - train_lr_scheduler="" - train_warmup_init_lr="" - train_warmup_updates="" - train_criterion="" - train_label_smoothing="" - train_clip_norm=(ClipNorm: 0.0 0.1 0.5 1 5) - train_max_tokens="4000" - train_arch=(Architecture: conv="fconv" transformer="transformer" fconv_iwslt_de_en="fconv_iwslt_de_en" transformer_iwslt_de_en="transformer_iwslt_de_en") - train_share_input_output_embed="" - train_skip_invalid_size_inputs_valid_test="yes" - train_adam_beta1="0.9" - train_adam_beta2="0.999" + train_batch_size=(TestMode: no="4096" yes=8) # Sockeye train_checkpoint_freq=(TestMode: no=5000 yes=100) @@ -52,12 +22,10 @@ global { train_num_decode_and_eval=(TestMode: no=500 yes=10) # TEST CONFIGURATIONS - test_model_selection_strategy="acc" - test_max_sent_length="300" test_beam_size=(TestMode: no="12" yes="1") test_batch_size=1 - test_replace_unk="True" - test_remove_bpe="" + test_max_sent_length=100 + ################################################################################################## # Job submission parameters @@ -76,7 +44,7 @@ global { resource_flags_decode="-q gpu.q -l gpu=1,mem_free=4g" # SGE: flags for notifying about job completion (put in your email address!) 
- action_flags="-m ae -M YOUR_EMAIL_HERE" + action_flags="-m ae -M post@cs.jhu.edu" # The default submitter: shell (run locally) or sge (run on a grid) submitter=(TestMode: no="sge" yes="shell") @@ -99,8 +67,24 @@ global { bpe_operations=32000 # options for cleaning training data - MaxLen=80 + MaxLen=100 Ratio=1 use_cpu=(TestMode: no yes) } + +plan test { + reach sacrebleu via + (SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) * + (TestMode: yes) + + reach sacrebleu via + (SubwordMethod: bpe) * (DoTokenize: yes) * (DoTruecase: yes no) * + (TestMode: yes) +} + +plan transformer { + reach sacrebleu via + (SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) * + (NumLayers: 6_6 10_2) +} From e8110a3f245b11598e15710ae66f6b7d4a1a4802 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 14 Dec 2018 10:08:34 -0500 Subject: [PATCH 5/8] removed plans from tape file (should be in tconf) --- fairseq.tape | 10 ---------- fairseq.tconf | 9 +++++++++ sockeye.tape | 9 --------- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/fairseq.tape b/fairseq.tape index b649e23..931e5c4 100644 --- a/fairseq.tape +++ b/fairseq.tape @@ -46,16 +46,6 @@ import "tapes/bleu.tape" # ==== pipeline ends here ==== -plan test { - reach sacrebleu, multi_bleu via (SgmDev: yes) * (SgmTest: yes) * (MergeTest: yes) * - (UseExistingTruecaser: no) * (TrainSampleSize: DontSample) * (DoTokenize: yes) * - (DoTruecase: yes) * (SubwordMethod: bpe) * (TrainDataSource: iwslt_deen_2014) * - (DevDataSource: iwslt_deen_dev2010 iwslt_deen_dev2012) * - (TestDataSource: iwslt_deen_test2010 iwslt_deen_test2011 iwslt_deen_test2012) * - (Architecture: fconv_iwslt_de_en) * (ClipNorm: 0.1) * (Dropout: 0.1) * (BpeMergeOps: 49500) * - (TestMode: no) -} - # Nuts and bolts: global { ducttape_experimental_packages=true diff --git a/fairseq.tconf b/fairseq.tconf index 5caf476..6c41a5e 100644 --- a/fairseq.tconf +++ b/fairseq.tconf @@ -55,3 +55,12 @@ global { use_cpu=(TestMode: no yes) 
} + +plan test { + reach sacrebleu, multi_bleu via (SgmDev: yes) * (SgmTest: yes) * (MergeTest: yes) * + (UseExistingTruecaser: no) * (TrainSampleSize: DontSample) * (DoTokenize: yes) * + (DoTruecase: yes) * (SubwordMethod: bpe) * (TrainDataSource: iwslt_deen_2014) * + (DevDataSource: iwslt_deen_dev2010 iwslt_deen_dev2012) * + (TestDataSource: iwslt_deen_test2010 iwslt_deen_test2011 iwslt_deen_test2012) * + (Architecture: fconv_iwslt_de_en) * (ClipNorm: 0.1) * (Dropout: 0.1) * (BpeMergeOps: 49500) +} diff --git a/sockeye.tape b/sockeye.tape index 1b2b0e6..c5fdad7 100644 --- a/sockeye.tape +++ b/sockeye.tape @@ -40,15 +40,6 @@ import "tapes/bleu.tape" # ==== pipeline ends here ==== -plan test { - reach sacrebleu via - (SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) * - (TrainDataSource: iwslt_deen_2014) * - (SgmDev: yes) * (DevDataSource: iwslt_deen_dev2012) * - (SgmTest: yes) * (TestDataSource: iwslt_deen_test2012_small) * - (TestMode: yes) -} - # Nuts and bolts: global { ducttape_experimental_packages=true From e9bcd93854727d23699e850f222f8d95936bfc03 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 14 Dec 2018 10:12:19 -0500 Subject: [PATCH 6/8] minor formatting --- tapes/packages.tape | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tapes/packages.tape b/tapes/packages.tape index 7b8a62e..ea50a1a 100644 --- a/tapes/packages.tape +++ b/tapes/packages.tape @@ -10,15 +10,12 @@ package sentencepiece :: .versioner=git .repo="https://github.com/google/sentenc make -j $(nproc) } -package tools - :: .versioner=git .repo="https://github.com/shuoyangd/tape4nmt-tools" .ref=HEAD { +package tools :: .versioner=git .repo="https://github.com/shuoyangd/tape4nmt-tools" .ref=HEAD { pip install -r requirements.txt } -# using my fork for now, as fairseq evolves pretty fast -package fairseq - :: .versioner=git .repo="https://github.com/shuoyangd/fairseq" .ref=HEAD { - +# using a fork for now, as fairseq evolves pretty fast +package 
fairseq :: .versioner=git .repo="https://github.com/shuoyangd/fairseq" .ref=HEAD { python setup.py build develop } From 0fe829239389406545d4cb4acb2209764ba8048f Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 14 Dec 2018 10:32:30 -0500 Subject: [PATCH 7/8] moved global variables into tape files --- confs/pipeline.tconf | 60 --------------------------------------- sockeye.tconf | 1 - tapes/download.tape | 17 +++++++++++ tapes/postprocessing.tape | 22 ++++++++++++++ tapes/subword.tape | 8 ++++++ tapes/tokenize.tape | 7 +++++ tapes/truecase.tape | 18 ++++++++++++ 7 files changed, 72 insertions(+), 61 deletions(-) delete mode 100644 confs/pipeline.tconf diff --git a/confs/pipeline.tconf b/confs/pipeline.tconf deleted file mode 100644 index 0b61cab..0000000 --- a/confs/pipeline.tconf +++ /dev/null @@ -1,60 +0,0 @@ -global { - dev_text=(SgmDev: - no=$out@download_or_link[DataSection:dev] - yes=(side: src=$src_out@dev_text_from_sgm trg=$trg_out@dev_text_from_sgm) - ) - - test_text=(SgmTest: - no=$out@download_or_link[DataSection:test] - yes=(side: src=$src_out@test_text_from_sgm trg=$trg_out@test_text_from_sgm) - ) - - src_truecaser=(UseExistingTruecaser: - yes="" - no=$out@train_truecaser[side:src] - ) - trg_truecaser=(UseExistingTruecaser: - yes="" - no=$out@train_truecaser[side:trg] - ) - - raw_data=$out@download_or_link - raw_data_test_src=$out@download_or_link[DataSection:test,side:src] - raw_data_test_trg=$out@download_or_link[DataSection:test,side:trg] - - tokenized_data=(DoTokenize: - yes=$out@tokenize - no=$out@download_or_link - ) - - # don't do truecase when doing characterize - # truecase will mess up the word boundary annotation - truecased_data=(DoTruecase: - yes=$out@truecase - no=$tokenized_data - ) - - prepared_data=(SubwordMethod: - sentencepiece=$out@apply_sentencepiece - bpe=$out@apply_bpe - none=$truecased_data - ) - - debped_output=(SubwordMethod: - sentencepiece=$out@remove_sentencepiece - bpe=$out@debpe - none=$out@decode - ) - - # don't do 
truecase when doing characterize - # truecase will mess up the word boundary annotation - detruecased_output=(DoTruecase: - yes=$out@detruecase - no=$debped_output - ) - - detokenized_output=(DoTokenize: - yes=$out@detokenize - no=$detruecased_output - ) -} diff --git a/sockeye.tconf b/sockeye.tconf index ff176d3..26f82bc 100644 --- a/sockeye.tconf +++ b/sockeye.tconf @@ -1,5 +1,4 @@ import "confs/data.tconf" -import "confs/pipeline.tconf" global { ################################################################################################## diff --git a/tapes/download.tape b/tapes/download.tape index dca3973..370c0b8 100644 --- a/tapes/download.tape +++ b/tapes/download.tape @@ -1,3 +1,20 @@ +global { + + raw_data=$out@download_or_link + raw_data_test_src=$out@download_or_link[DataSection:test,side:src] + raw_data_test_trg=$out@download_or_link[DataSection:test,side:trg] + + dev_text=(SgmDev: + no=$out@download_or_link[DataSection:dev] + yes=(side: src=$src_out@dev_text_from_sgm trg=$trg_out@dev_text_from_sgm) + ) + + test_text=(SgmTest: + no=$out@download_or_link[DataSection:test] + yes=(side: src=$src_out@test_text_from_sgm trg=$trg_out@test_text_from_sgm) + ) +} + task download_or_link : tools :: in=(DataSection: train=$train_prefix dev=$dev_prefix test=$test_prefix) :: side=(side: src trg) diff --git a/tapes/postprocessing.tape b/tapes/postprocessing.tape index a147a6f..8dbe7c0 100644 --- a/tapes/postprocessing.tape +++ b/tapes/postprocessing.tape @@ -1,3 +1,25 @@ +global { + + debped_output=(SubwordMethod: + sentencepiece=$out@remove_sentencepiece + bpe=$out@debpe + none=$out@decode + ) + + # don't do truecase when doing characterize + # truecase will mess up the word boundary annotation + detruecased_output=(DoTruecase: + yes=$out@detruecase + no=$debped_output + ) + + detokenized_output=(DoTokenize: + yes=$out@detokenize + no=$detruecased_output + ) + +} + task debpe < in=$out@decode > out diff --git a/tapes/subword.tape b/tapes/subword.tape index 
a03ed9c..e673969 100644 --- a/tapes/subword.tape +++ b/tapes/subword.tape @@ -1,3 +1,11 @@ +global { + prepared_data=(SubwordMethod: + sentencepiece=$out@apply_sentencepiece + bpe=$out@apply_bpe + none=$truecased_data + ) +} + task train_bpe : subword_nmt < src_in=$tokenized_data[DataSection:train,side:src] < trg_in=$tokenized_data[DataSection:train,side:trg] diff --git a/tapes/tokenize.tape b/tapes/tokenize.tape index 2f747dc..35e9fdf 100644 --- a/tapes/tokenize.tape +++ b/tapes/tokenize.tape @@ -1,3 +1,10 @@ +global { + tokenized_data=(DoTokenize: + yes=$out@tokenize + no=$out@download_or_link + ) +} + func tokenize : mosesdecoder tools # stanford_seg < in > out diff --git a/tapes/truecase.tape b/tapes/truecase.tape index 0d87a65..37e6873 100644 --- a/tapes/truecase.tape +++ b/tapes/truecase.tape @@ -1,3 +1,21 @@ +global { + src_truecaser=(UseExistingTruecaser: + yes="" + no=$out@train_truecaser[side:src] + ) + trg_truecaser=(UseExistingTruecaser: + yes="" + no=$out@train_truecaser[side:trg] + ) + + # don't do truecase when doing characterize + # truecase will mess up the word boundary annotation + truecased_data=(DoTruecase: + yes=$out@truecase + no=$tokenized_data + ) +} + task train_truecaser : mosesdecoder < src_in=$prepared_data[DataSection:train,side:src] < trg_in=$prepared_data[DataSection:train,side:trg] From 6334bc699af8f06530a319bc769a25bd88f27ac9 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 14 Dec 2018 13:35:59 -0500 Subject: [PATCH 8/8] fix --- sockeye.tconf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sockeye.tconf b/sockeye.tconf index 26f82bc..2ac276c 100644 --- a/sockeye.tconf +++ b/sockeye.tconf @@ -43,7 +43,7 @@ global { resource_flags_decode="-q gpu.q -l gpu=1,mem_free=4g" # SGE: flags for notifying about job completion (put in your email address!) 
- action_flags="-m ae -M post@cs.jhu.edu" + action_flags="-m ae -M YOUR@EMAIL.ADDRESS" # The default submitter: shell (run locally) or sge (run on a grid) submitter=(TestMode: no="sge" yes="shell")