From 6710927f00f8ee54a0dd882fe0c2478179f431d7 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 14 Dec 2018 09:47:12 -0500
Subject: [PATCH 1/8] removed merging, dummy steps in favor of file prefixes

---
 confs/data.tconf           | 54 +++++++++++---------------------------
 confs/pipeline.tconf       | 27 +++++--------------
 sockeye.tape               |  6 -----
 tapes/bleu.tape            |  8 +++---
 tapes/download.tape        | 41 ++++++++++++++++++++++++++---
 tapes/dummy.tape           | 30 ---------------------
 tapes/merge.tape           | 17 ------------
 tapes/postprocessing.tape  |  4 +--
 tapes/prepare_devtest.tape | 12 ++++-----
 tapes/sockeye.tape         | 10 +++----
 tapes/subword.tape         | 24 ++++++++---------
 tapes/tokenize.tape        |  4 +--
 tapes/truecase.tape        | 11 +++++---
 13 files changed, 99 insertions(+), 149 deletions(-)
 delete mode 100644 tapes/dummy.tape
 delete mode 100644 tapes/merge.tape

diff --git a/confs/data.tconf b/confs/data.tconf
index 6e9b5dc..1f79577 100644
--- a/confs/data.tconf
+++ b/confs/data.tconf
@@ -1,43 +1,21 @@
 global {
 
-  SRC=(TrainDataSource:
-    iwslt_deen_2014="de"
-  )
-  TRG=(TrainDataSource:
-    iwslt_deen_2014="en"
-  )
-  trg_lang=en  # FIXME (only used by wrap_xml, under some rare cases)
+  SRC=de
+  TRG=en
 
-  train_data=(TrainDataSource:
-    iwslt_deen_2014=(side:
-      src="/path/to/iwslt/train.tags.nourl.de-en.de"
-      trg="/path/to/iwslt/train.tags.nourl.de-en.en"
-    )
-  )
+  # IWSLT
+  train_prefix="/home/hltcoe/mpost/code/tape4nmt/iwslt/train.tags.nourl.de-en"
+  dev_prefix="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.dev2010.de-en"
+  test_prefix=(TestSet:
+      iwslt10="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2010.de-en"
+      iwslt11="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2011.de-en"
+      iwslt12="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2012.de-en")
 
-  dev_data=(DevDataSource:
-    iwslt_deen_dev2010=(side:
-      src="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.de.xml"
-      trg="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.en.xml"
-    )
-    iwslt_deen_dev2012=(side:
-      src="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.de.xml"
-      trg="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.en.xml"
-    )
-  )
-
-  test_data=(TestDataSource:
-    iwslt_deen_test2010=(side:
-      src="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.de.xml"
-      trg="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.en.xml"
-    )
-    iwslt_deen_test2011=(side:
-      src="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.de.xml"
-      trg="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.en.xml"
-    )
-    iwslt_deen_test2012=(side:
-      src="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.de.xml"
-      trg="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.en.xml"
-    )
-  )
+  # WMT18
+  # These are the file prefixes, to which ".$SRC" and ".$TRG" are appended.
+  # You can list any number of prefixes, which will be concatenated.
+  # You can also use SacreBLEU to generate data (it will call `--echo src|ref` depending on the side).
+  # train_prefix="/export/common/data/corpora/bitext/de-en/train/commoncrawl.de-en /export/common/data/corpora/bitext/de-en/train/europarl-v7.de-en /export/common/data/corpora/bitext/de-en/train/news-commentary-v13.de-en /export/common/data/corpora/bitext/raw/wmt17/rapid2016.de-en"
+  # dev_prefix="/home/hltcoe/mpost/data/bitext/de-en/test/newstest2016.de-en /home/hltcoe/mpost/data/bitext/de-en/test/newstest2017.de-en"
+  # test_prefix="sacrebleu://wmt18 de-en"
 }
diff --git a/confs/pipeline.tconf b/confs/pipeline.tconf
index 91b415e..0b61cab 100644
--- a/confs/pipeline.tconf
+++ b/confs/pipeline.tconf
@@ -1,11 +1,11 @@
 global {
   dev_text=(SgmDev:
-    no=$out@download_or_link[DevtestDataSection:dev]
+    no=$out@download_or_link[DataSection:dev]
     yes=(side: src=$src_out@dev_text_from_sgm trg=$trg_out@dev_text_from_sgm)
   )
 
   test_text=(SgmTest:
-    no=$out@download_or_link[DevtestDataSection:test]
+    no=$out@download_or_link[DataSection:test]
     yes=(side: src=$src_out@test_text_from_sgm trg=$trg_out@test_text_from_sgm)
   )
 
@@ -18,27 +18,14 @@ global {
     no=$out@train_truecaser[side:trg]
   )
 
-  merged_data=(MergeTest:
-    no=(DataSection:
-      train=$out@merge[DataSection:train]
-      devtest=(DevtestDataSection:
-        dev=$out@merge[DevtestDataSection:dev]
-        test=$test_text
-      )
-    )
-    yes=$out@merge
-  )
+  raw_data=$out@download_or_link
+  raw_data_test_src=$out@download_or_link[DataSection:test,side:src]
+  raw_data_test_trg=$out@download_or_link[DataSection:test,side:trg]
 
   tokenized_data=(DoTokenize:
     yes=$out@tokenize
-    no=(DataSection:
-      train=$out@dummy_aggregate_merge[DataSection:train]
-      devtest=(DevtestDataSection:
-        dev=$out@dummy_aggregate_merge[DevtestDataSection:dev]
-        test=$test_text
-      )
-    )
-  )
+    no=$out@download_or_link
+   )
 
   # don't do truecase when doing characterize
   # truecase will mess up the word boundary annotation
diff --git a/sockeye.tape b/sockeye.tape
index eb6e245..1b2b0e6 100644
--- a/sockeye.tape
+++ b/sockeye.tape
@@ -1,7 +1,6 @@
 import "tapes/packages.tape"
 import "tapes/submitters.tape"
 import "tapes/versioners.tape"
-import "tapes/dummy.tape"
 
 # ==== pipeline starts here ====
 
@@ -17,11 +16,6 @@ import "tapes/prepare_train.tape"
 # - extract dev/test from sgm format, if the wrapping exists
 import "tapes/prepare_devtest.tape"
 
-# merge multiple train/dev/test sets
-# note that merging of train/dev is mandatory,
-# while test is controlled by the branch point `MergeTest`
-import "tapes/merge.tape"
-
 # tasks related to tokenize
 import "tapes/tokenize.tape"
 
diff --git a/tapes/bleu.tape b/tapes/bleu.tape
index ef6bc04..18ffae3 100644
--- a/tapes/bleu.tape
+++ b/tapes/bleu.tape
@@ -2,8 +2,8 @@
 # as that will involve creating a wrap template for merged xml
 task nist_bleu : mosesdecoder
     < in=$out@wrap_xml
-    < wrap_template=$out@download_or_link[DevtestDataSection:test,side:src]
-    < ref=$out@dummy_aggregate_merge[DevtestDataSection:test,side:trg]
+    < wrap_template=$out@download_or_link[DataSection:test,side:src]
+    < ref=$tokenized_data[DataSection:test,side:trg]
     > bleu
     > bleu_c
     :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
@@ -14,7 +14,7 @@ task nist_bleu : mosesdecoder
 
 task multi_bleu : mosesdecoder
     < in=$detokenized_output
-    < ref=$out@dummy_aggregate_merge[DataSection:devtest,DevtestDataSection:test,side:trg]
+    < ref=$tokenized_data[DataSection:test,side:trg]
     > bleu
     > bleu_c
     :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
@@ -34,7 +34,7 @@ task multi_bleu : mosesdecoder
 
 task sacrebleu : sacrebleu
     < in=$detokenized_output
-    < ref=$out@dummy_aggregate_merge[DataSection:devtest,DevtestDataSection:test,side:trg]
+    < ref=$raw_data_test_trg
     > bleu
     > signature
     :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags
diff --git a/tapes/download.tape b/tapes/download.tape
index 1b91c92..dca3973 100644
--- a/tapes/download.tape
+++ b/tapes/download.tape
@@ -1,6 +1,41 @@
 task download_or_link : tools
-    < in=(DataSection: train=$train_data devtest=(DevtestDataSection: dev=$dev_data test=$test_data))
+    :: in=(DataSection: train=$train_prefix dev=$dev_prefix test=$test_prefix)
+    :: side=(side: src trg)
+    :: SRC=@
+    :: TRG=@
+    :: section=(DataSection: train dev test)
+    :: testing=(TestMode: no yes)
     > out
-    :: pyenv=@ {
-  python $tools/download_or_link.py $in $out
+   :: pyenv=@ {
+  # Write one side of one data section to $out: either a SacreBLEU test set or the concatenation of every "<prefix>.<lang>" file.
+  if [[ $side == "src" ]]; then
+    side="src"
+    lang=$SRC
+  elif [[ $side == "trg" ]]; then
+    side="ref"
+    lang=$TRG
+  fi
+
+  if [[ $in == sacrebleu://* ]]; then
+    # format: "sacrebleu://test-set langpair"
+
+    # strip everything up to "://", then word-split the rest into (test-set langpair)
+    args=(${in##*://})
+    sacrebleu -t ${args[0]} -l ${args[1]} --echo $side > $out
+  else
+    for infile in $in; do
+      infile="$infile.$lang"
+
+      if [[ $testing == "yes" ]]; then
+        if [[ $section == "train" ]]; then
+          numlines=10000
+        else
+          numlines=100
+        fi
+        head -n $numlines <(zcat -f $infile) >> $out  # zcat -f: gzipped inputs work in test mode too
+      else
+        zcat -f $infile >> $out
+      fi
+    done
+  fi
 }
diff --git a/tapes/dummy.tape b/tapes/dummy.tape
deleted file mode 100644
index 6fb8e6b..0000000
--- a/tapes/dummy.tape
+++ /dev/null
@@ -1,30 +0,0 @@
-func dummy
-    < in
-    > out {
-    
-  ln -s $in $out
-}
-
-task dummy_aggregate_dev_text calls dummy
-    < in=$dev_text
-    > out
-
-task dummy_aggregate_test_text calls dummy
-    < in=$test_text
-    > out
-
-task dummy_aggregate_merge calls dummy
-    < in=$merged_data
-    > out
-
-task dummy_aggregate_tokenize calls dummy
-    < in=$tokenized_data
-    > out
-
-task dummy_aggregate_truecase calls dummy
-    < in=$truecased_data
-    > out
-
-task dummy_aggregate_subword calls dummy
-    < in=$prepared_data
-    > out
diff --git a/tapes/merge.tape b/tapes/merge.tape
deleted file mode 100644
index 520915c..0000000
--- a/tapes/merge.tape
+++ /dev/null
@@ -1,17 +0,0 @@
-task merge
-    < in=(DataSection:
-            train=(side:
-              src=$src_out@train_sample[TrainDataSource:*]
-              trg=$trg_out@train_sample[TrainDataSource:*]
-            )
-            devtest=(DevtestDataSection:
-              dev=$out@dummy_aggregate_dev_text[DevDataSource:*]
-              test=$out@dummy_aggregate_test_text[TestDataSource:*]
-            )
-         )
-    > out {
-
-  for file in $in ; do
-    cat $file >> $out
-  done
-}
diff --git a/tapes/postprocessing.tape b/tapes/postprocessing.tape
index 8c8a198..a147a6f 100644
--- a/tapes/postprocessing.tape
+++ b/tapes/postprocessing.tape
@@ -49,9 +49,9 @@ task decharacterize : tools
 # do not support merge for the moment
 task wrap_xml : mosesdecoder
     < in=$detokenized_output
-    < wrap_template=$out@download_or_link[DevtestDataSection:test,side:src]
+    < wrap_template=$out@download_or_link[DataSection:test,side:src]
     > out
-    :: trg_lang=@
+    :: trg_lang=$TRG
     :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
 
   $mosesdecoder/scripts/ems/support/wrap-xml.perl $trg_lang $wrap_template < $in > $out
diff --git a/tapes/prepare_devtest.tape b/tapes/prepare_devtest.tape
index abe7532..5b42bdf 100644
--- a/tapes/prepare_devtest.tape
+++ b/tapes/prepare_devtest.tape
@@ -1,7 +1,7 @@
 task dev_text_from_sgm : mosesdecoder
-    < src_in=$out@download_or_link[DataSection:devtest,DevtestDataSection:dev,side:src]
-    < trg_in=$out@download_or_link[DataSection:devtest,DevtestDataSection:dev,side:trg]
-    < wrap_template=$out@download_or_link[DataSection:devtest,DevtestDataSection:dev,side:src]
+    < src_in=$out@download_or_link[DataSection:dev,side:src]
+    < trg_in=$out@download_or_link[DataSection:dev,side:trg]
+    < wrap_template=$out@download_or_link[DataSection:dev,side:src]
     > src_out
     > trg_out {
 
@@ -19,9 +19,9 @@ task dev_text_from_sgm : mosesdecoder
 }
 
 task test_text_from_sgm : mosesdecoder
-    < src_in=$out@download_or_link[DataSection:devtest,DevtestDataSection:test,side:src]
-    < trg_in=$out@download_or_link[DataSection:devtest,DevtestDataSection:test,side:trg]
-    < wrap_template=$out@download_or_link[DataSection:devtest,DevtestDataSection:test,side:src]
+    < src_in=$out@download_or_link[DataSection:test,side:src]
+    < trg_in=$out@download_or_link[DataSection:test,side:trg]
+    < wrap_template=$out@download_or_link[DataSection:test,side:src]
     > src_out
     > trg_out {
 
diff --git a/tapes/sockeye.tape b/tapes/sockeye.tape
index 2ab9a35..b2060be 100644
--- a/tapes/sockeye.tape
+++ b/tapes/sockeye.tape
@@ -1,6 +1,6 @@
 task prepare_data : sockeye
-    < train_src_in=$out@dummy_aggregate_subword[DataSection:train,side:src]
-    < train_trg_in=$out@dummy_aggregate_subword[DataSection:train,side:trg]
+    < train_src_in=$prepared_data[DataSection:train,side:src]
+    < train_trg_in=$prepared_data[DataSection:train,side:trg]
     > data
     :: pyenv=@
     :: train_max_sent_length=$MaxLen
@@ -23,8 +23,8 @@ task prepare_data : sockeye
 
 task train : sockeye
     < prepared_data=$data@prepare_data
-    < dev_src=$out@dummy_aggregate_subword[DataSection:devtest,DevtestDataSection:dev,side:src]
-    < dev_trg=$out@dummy_aggregate_subword[DataSection:devtest,DevtestDataSection:dev,side:trg]
+    < dev_src=$prepared_data[DataSection:dev,side:src]
+    < dev_trg=$prepared_data[DataSection:dev,side:trg]
     > model
     :: pyenv=@
     :: train_batch_type=@
@@ -92,7 +92,7 @@ task train : sockeye
 # the target input here is used to compute naïve acc and ppl,
 # that's why we need post-bpe target input
 task decode : sockeye
-    < in=$out@dummy_aggregate_subword[DataSection:devtest,DevtestDataSection:test,side:src]
+    < in=$prepared_data[DataSection:test,side:src]
     < model=$model@train
     > out="out"
     > log="out.log"
diff --git a/tapes/subword.tape b/tapes/subword.tape
index f7453c8..a03ed9c 100644
--- a/tapes/subword.tape
+++ b/tapes/subword.tape
@@ -1,19 +1,19 @@
 task train_bpe : subword_nmt
-      < src_in=$out@dummy_aggregate_truecase[DataSection:train,side:src]  # FIXME
-      < trg_in=$out@dummy_aggregate_truecase[DataSection:train,side:trg]  # FIXME
-      > model="bpe.model"
-      :: bpe_operations=@
-      :: SRC=@
-      :: TRG=@
-      :: pyenv=@
-      :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
+  < src_in=$tokenized_data[DataSection:train,side:src]
+  < trg_in=$tokenized_data[DataSection:train,side:trg]
+  > model="bpe.model"
+  :: bpe_operations=@
+  :: SRC=@
+  :: TRG=@
+  :: pyenv=@
+  :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
 
   subword-nmt learn-joint-bpe-and-vocab -i $src_in $trg_in -s $bpe_operations -o $model --write-vocabulary bpe.vocab.$SRC bpe.vocab.$TRG -v 2> log
 }
 
 task apply_bpe : subword_nmt
     # < in=$truecased_data
-    < in=$out@dummy_aggregate_truecase
+    < in=$tokenized_data
     < model=$model@train_bpe
     > out
     :: pyenv=@ {
@@ -22,8 +22,8 @@ task apply_bpe : subword_nmt
 }
 
 task train_sentencepiece : sentencepiece
-  < src_in=$out@dummy_aggregate_merge[DataSection:train,side:src]
-  < trg_in=$out@dummy_aggregate_merge[DataSection:train,side:trg]
+  < src_in=$tokenized_data[DataSection:train,side:src]
+  < trg_in=$tokenized_data[DataSection:train,side:trg]
   > model="sp.model"
   > vocab="sp.vocab"
   :: sentencepiece_vocab_size=@
@@ -34,7 +34,7 @@ task train_sentencepiece : sentencepiece
 }
 
 task apply_sentencepiece : sentencepiece
-  < in=$out@dummy_aggregate_merge
+  < in=$tokenized_data
   < model=$model@train_sentencepiece
   > out 
   :: pyenv=@ {
diff --git a/tapes/tokenize.tape b/tapes/tokenize.tape
index 1627a25..2f747dc 100644
--- a/tapes/tokenize.tape
+++ b/tapes/tokenize.tape
@@ -16,13 +16,13 @@ func tokenize : mosesdecoder tools # stanford_seg
 }
 
 task tokenize calls tokenize : mosesdecoder tools # stanford_seg
-    < in=$out@dummy_aggregate_merge
+    < in=$raw_data
     > out
     :: Lang=(side: src=$SRC trg=$TRG)
     :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags
 
 task characterize : tools
-    < in=$out@tokenize  # FIXME fix test
+    < in=$out@tokenize
     > out
     :: pyenv=@
     :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
diff --git a/tapes/truecase.tape b/tapes/truecase.tape
index 98fb120..0d87a65 100644
--- a/tapes/truecase.tape
+++ b/tapes/truecase.tape
@@ -1,19 +1,22 @@
 task train_truecaser : mosesdecoder
-    < in=$out@dummy_aggregate_tokenize[DataSection:train]  # FIXME
+    < src_in=$prepared_data[DataSection:train,side:src]
+    < trg_in=$prepared_data[DataSection:train,side:trg]
     > out
     :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
 
-  for in_file in $in ; do
-    cat $in_file >> $PWD/tmp
+  tmpfile=$(mktemp)
+  # concatenate both declared inputs into the temporary corpus the trainer reads
+  for in_file in $src_in $trg_in ; do
+    cat $in_file >> $tmpfile
   done
 
-  $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus $PWD/tmp -model $out
+  $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus $tmpfile -model $out
 
-  rm $PWD/tmp
+  rm -f $tmpfile
 }
 
 task truecase : mosesdecoder
-    < in=$out@dummy_aggregate_tokenize
+    < in=$tokenized_data
     < model=(side: src=$src_truecaser trg=$trg_truecaser)
     > out
     :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {

From 9f976a3d2c15a3b34970eec4f15d2ebd0b600cd8 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 14 Dec 2018 10:06:21 -0500
Subject: [PATCH 2/8] run script now takes tconf

---
 run | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/run b/run
index 3af96e1..a3b6186 100755
--- a/run
+++ b/run
@@ -2,6 +2,8 @@
 
 set -u
 
+TAPEDIR=$(dirname "$0")
+
 TOOLKIT=${1:-}
 
 if [[ -z $TOOLKIT ]]; then
@@ -12,11 +14,18 @@ if [[ -z $TOOLKIT ]]; then
 fi
 shift
 
-TCONF=$TOOLKIT.tconf
+TAPEFILE=$TAPEDIR/$TOOLKIT.tape
+if [[ ! -e $TAPEFILE ]]; then
+    echo "Fatal: Couldn't find toolkit tape file $TAPEFILE"
+    exit 1
+fi
+
+TCONF=${1:-}
 if [[ ! -e $TCONF ]]; then
-    echo "Fatal: Couldn't find $TOOLKIT.tconf"
+    echo "Fatal: Couldn't find tconf file '$TCONF'"
     exit 1
 fi
+shift
 
 DUCTTAPE=$(which ducttape)
 if [[ $? -ne 0 ]]; then
@@ -24,4 +33,4 @@ if [[ $? -ne 0 ]]; then
    exit 1
 fi
 
-${DUCTTAPE} ${TOOLKIT}.tape -C ${TCONF} $@
+"${DUCTTAPE}" "${TAPEFILE}" -C "${TCONF}" "$@"

From 45accd26345ebae63e74a0255e7a840b2cb82df2 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 14 Dec 2018 10:07:42 -0500
Subject: [PATCH 3/8] minor change, moved submitter

---
 tapes/prepare_train.tape | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tapes/prepare_train.tape b/tapes/prepare_train.tape
index 1e09c26..18bc1ee 100644
--- a/tapes/prepare_train.tape
+++ b/tapes/prepare_train.tape
@@ -3,11 +3,11 @@ task train_clean : mosesdecoder
     < trg_in=$out@download_or_link[DataSection:train,side:trg]
     > src_out
     > trg_out
-    :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags
     :: SRC=@
     :: TRG=@
     :: Ratio=@
-    :: MaxLen=@ {
+    :: MaxLen=@
+    :: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
 
   mkdir -p tmp
   filename=$(basename $src_in)

From 69eeb9e9e095447ca2e1e101155f6856270fa848 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 14 Dec 2018 10:08:04 -0500
Subject: [PATCH 4/8] removed unused variables

---
 sockeye.tconf | 60 +++++++++++++++++++--------------------------------
 1 file changed, 22 insertions(+), 38 deletions(-)

diff --git a/sockeye.tconf b/sockeye.tconf
index 6173fb2..ff176d3 100644
--- a/sockeye.tconf
+++ b/sockeye.tconf
@@ -9,42 +9,12 @@ global {
   # All ducttape files will be written underneath this directory
   ducttape_output="out"
 
-  num_layers=(TestMode: no="6:6" yes="1:1")
+  num_layers=(TestMode: no=(NumLayers: 6_6="6:6" 10_2="10:2") yes="1:1")
   model_size=512
   embed_size="512:512"
 
-  # all default is consistent with nematus
-  train_train_from="" # if there is a previous model to start with
-  train_train_from_state_dict="" # if there is a previous dict to start with
-  train_start_epoch="" # if trained for certain amount of epochs previously
-
   train_batch_type=(TestMode: no="word" yes="sentence")
-  train_batch_size=(TestMode: no="80" yes=8)
-  train_optim="adam"
-  train_dropout=(Dropout: 0.1 0.3 0.5)
-  train_lr="0.001"
-
-  # train_lr_min="1e-8"
-  train_lr_min=""
-  train_lr_shrink="0.5"
-
-  # train_lr_scheduler="inverse_sqrt"
-  # train_warmup_init_lr="1e-07"
-  # train_warmup_updates="4000"
-  # train_criterion="label_smoothed_cross_entropy"
-  # train_label_smoothing="0.1"
-  train_lr_scheduler=""
-  train_warmup_init_lr=""
-  train_warmup_updates=""
-  train_criterion=""
-  train_label_smoothing=""
-  train_clip_norm=(ClipNorm: 0.0 0.1 0.5 1 5)
-  train_max_tokens="4000"
-  train_arch=(Architecture: conv="fconv" transformer="transformer" fconv_iwslt_de_en="fconv_iwslt_de_en" transformer_iwslt_de_en="transformer_iwslt_de_en")
-  train_share_input_output_embed=""
-  train_skip_invalid_size_inputs_valid_test="yes"
-  train_adam_beta1="0.9"
-  train_adam_beta2="0.999"
+  train_batch_size=(TestMode: no="4096" yes=8)
 
   # Sockeye
   train_checkpoint_freq=(TestMode: no=5000 yes=100)
@@ -52,12 +22,10 @@ global {
   train_num_decode_and_eval=(TestMode: no=500 yes=10)
 
   # TEST CONFIGURATIONS
-  test_model_selection_strategy="acc"
-  test_max_sent_length="300"
   test_beam_size=(TestMode: no="12" yes="1")
   test_batch_size=1
-  test_replace_unk="True"
-  test_remove_bpe=""
+  test_max_sent_length=100
+
 
   ##################################################################################################
   # Job submission parameters
@@ -76,7 +44,7 @@ global {
   resource_flags_decode="-q gpu.q -l gpu=1,mem_free=4g"
 
   # SGE: flags for notifying about job completion (put in your email address!)
-  action_flags="-m ae -M YOUR_EMAIL_HERE"
+  action_flags="-m ae -M post@cs.jhu.edu"
 
   # The default submitter: shell (run locally) or sge (run on a grid)
   submitter=(TestMode: no="sge" yes="shell")
@@ -99,8 +67,24 @@ global {
   bpe_operations=32000
 
   # options for cleaning training data
-  MaxLen=80
+  MaxLen=100
   Ratio=1
 
   use_cpu=(TestMode: no yes)
 }
+
+plan test {
+  reach sacrebleu via 
+    (SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) * 
+    (TestMode: yes)
+
+  reach sacrebleu via 
+    (SubwordMethod: bpe) * (DoTokenize: yes) * (DoTruecase: yes no) * 
+    (TestMode: yes)
+}
+
+plan transformer {
+  reach sacrebleu via
+    (SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) *
+    (NumLayers: 6_6 10_2)
+}

From e8110a3f245b11598e15710ae66f6b7d4a1a4802 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 14 Dec 2018 10:08:34 -0500
Subject: [PATCH 5/8] removed plans from tape file (should be in tconf)

---
 fairseq.tape  | 10 ----------
 fairseq.tconf |  9 +++++++++
 sockeye.tape  |  9 ---------
 3 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/fairseq.tape b/fairseq.tape
index b649e23..931e5c4 100644
--- a/fairseq.tape
+++ b/fairseq.tape
@@ -46,16 +46,6 @@ import "tapes/bleu.tape"
 
 # ==== pipeline ends here ====
 
-plan test {
-  reach sacrebleu, multi_bleu via (SgmDev: yes) * (SgmTest: yes) * (MergeTest: yes) * 
-    (UseExistingTruecaser: no) * (TrainSampleSize: DontSample) * (DoTokenize: yes) * 
-    (DoTruecase: yes) * (SubwordMethod: bpe) * (TrainDataSource: iwslt_deen_2014) * 
-    (DevDataSource: iwslt_deen_dev2010 iwslt_deen_dev2012) * 
-    (TestDataSource: iwslt_deen_test2010 iwslt_deen_test2011 iwslt_deen_test2012) * 
-    (Architecture: fconv_iwslt_de_en) * (ClipNorm: 0.1) * (Dropout: 0.1) * (BpeMergeOps: 49500) *
-    (TestMode: no)
-}
-
 # Nuts and bolts:
 global {
   ducttape_experimental_packages=true
diff --git a/fairseq.tconf b/fairseq.tconf
index 5caf476..6c41a5e 100644
--- a/fairseq.tconf
+++ b/fairseq.tconf
@@ -55,3 +55,12 @@ global {
 
   use_cpu=(TestMode: no yes)
 }
+
+plan test {
+  reach sacrebleu, multi_bleu via (SgmDev: yes) * (SgmTest: yes) * (MergeTest: yes) * 
+    (UseExistingTruecaser: no) * (TrainSampleSize: DontSample) * (DoTokenize: yes) * 
+    (DoTruecase: yes) * (SubwordMethod: bpe) * (TrainDataSource: iwslt_deen_2014) * 
+    (DevDataSource: iwslt_deen_dev2010 iwslt_deen_dev2012) * 
+    (TestDataSource: iwslt_deen_test2010 iwslt_deen_test2011 iwslt_deen_test2012) * 
+    (Architecture: fconv_iwslt_de_en) * (ClipNorm: 0.1) * (Dropout: 0.1) * (BpeMergeOps: 49500)
+}
diff --git a/sockeye.tape b/sockeye.tape
index 1b2b0e6..c5fdad7 100644
--- a/sockeye.tape
+++ b/sockeye.tape
@@ -40,15 +40,6 @@ import "tapes/bleu.tape"
 
 # ==== pipeline ends here ====
 
-plan test {
-  reach sacrebleu via 
-    (SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) * 
-    (TrainDataSource: iwslt_deen_2014) * 
-    (SgmDev: yes) * (DevDataSource: iwslt_deen_dev2012) * 
-    (SgmTest: yes) * (TestDataSource: iwslt_deen_test2012_small) * 
-    (TestMode: yes)
-}
-
 # Nuts and bolts:
 global {
   ducttape_experimental_packages=true

From e9bcd93854727d23699e850f222f8d95936bfc03 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 14 Dec 2018 10:12:19 -0500
Subject: [PATCH 6/8] minor formatting

---
 tapes/packages.tape | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/tapes/packages.tape b/tapes/packages.tape
index 7b8a62e..ea50a1a 100644
--- a/tapes/packages.tape
+++ b/tapes/packages.tape
@@ -10,15 +10,12 @@ package sentencepiece :: .versioner=git .repo="https://github.com/google/sentenc
   make -j $(nproc)
 }
 
-package tools
-    :: .versioner=git .repo="https://github.com/shuoyangd/tape4nmt-tools" .ref=HEAD {
+package tools :: .versioner=git .repo="https://github.com/shuoyangd/tape4nmt-tools" .ref=HEAD {
   pip install -r requirements.txt
 }
 
-# using my fork for now, as fairseq evolves pretty fast
-package fairseq
-    :: .versioner=git .repo="https://github.com/shuoyangd/fairseq" .ref=HEAD {
-
+# using a fork for now, as fairseq evolves pretty fast
+package fairseq :: .versioner=git .repo="https://github.com/shuoyangd/fairseq" .ref=HEAD {
   python setup.py build develop
 }
 

From 0fe829239389406545d4cb4acb2209764ba8048f Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 14 Dec 2018 10:32:30 -0500
Subject: [PATCH 7/8] moved global variables into tape files

---
 confs/pipeline.tconf      | 60 ---------------------------------------
 sockeye.tconf             |  1 -
 tapes/download.tape       | 17 +++++++++++
 tapes/postprocessing.tape | 22 ++++++++++++++
 tapes/subword.tape        |  8 ++++++
 tapes/tokenize.tape       |  7 +++++
 tapes/truecase.tape       | 18 ++++++++++++
 7 files changed, 72 insertions(+), 61 deletions(-)
 delete mode 100644 confs/pipeline.tconf

diff --git a/confs/pipeline.tconf b/confs/pipeline.tconf
deleted file mode 100644
index 0b61cab..0000000
--- a/confs/pipeline.tconf
+++ /dev/null
@@ -1,60 +0,0 @@
-global {
-  dev_text=(SgmDev:
-    no=$out@download_or_link[DataSection:dev]
-    yes=(side: src=$src_out@dev_text_from_sgm trg=$trg_out@dev_text_from_sgm)
-  )
-
-  test_text=(SgmTest:
-    no=$out@download_or_link[DataSection:test]
-    yes=(side: src=$src_out@test_text_from_sgm trg=$trg_out@test_text_from_sgm)
-  )
-
-  src_truecaser=(UseExistingTruecaser:
-    yes=""
-    no=$out@train_truecaser[side:src]
-  )
-  trg_truecaser=(UseExistingTruecaser:
-    yes=""
-    no=$out@train_truecaser[side:trg]
-  )
-
-  raw_data=$out@download_or_link
-  raw_data_test_src=$out@download_or_link[DataSection:test,side:src]
-  raw_data_test_trg=$out@download_or_link[DataSection:test,side:trg]
-
-  tokenized_data=(DoTokenize:
-    yes=$out@tokenize
-    no=$out@download_or_link
-   )
-
-  # don't do truecase when doing characterize
-  # truecase will mess up the word boundary annotation
-  truecased_data=(DoTruecase:
-    yes=$out@truecase
-    no=$tokenized_data
-  )
-
-  prepared_data=(SubwordMethod:
-    sentencepiece=$out@apply_sentencepiece
-    bpe=$out@apply_bpe
-    none=$truecased_data
-  )
-
-  debped_output=(SubwordMethod:
-    sentencepiece=$out@remove_sentencepiece
-    bpe=$out@debpe
-    none=$out@decode
-  )
-
-  # don't do truecase when doing characterize
-  # truecase will mess up the word boundary annotation
-  detruecased_output=(DoTruecase:
-    yes=$out@detruecase
-    no=$debped_output
-  )
-
-  detokenized_output=(DoTokenize:
-    yes=$out@detokenize
-    no=$detruecased_output
-  )
-}
diff --git a/sockeye.tconf b/sockeye.tconf
index ff176d3..26f82bc 100644
--- a/sockeye.tconf
+++ b/sockeye.tconf
@@ -1,5 +1,4 @@
 import "confs/data.tconf"
-import "confs/pipeline.tconf"
 
 global {
   ##################################################################################################
diff --git a/tapes/download.tape b/tapes/download.tape
index dca3973..370c0b8 100644
--- a/tapes/download.tape
+++ b/tapes/download.tape
@@ -1,3 +1,20 @@
+global {
+
+  raw_data=$out@download_or_link
+  raw_data_test_src=$out@download_or_link[DataSection:test,side:src]
+  raw_data_test_trg=$out@download_or_link[DataSection:test,side:trg]
+
+  dev_text=(SgmDev:
+    no=$out@download_or_link[DataSection:dev]
+    yes=(side: src=$src_out@dev_text_from_sgm trg=$trg_out@dev_text_from_sgm)
+  )
+
+  test_text=(SgmTest:
+    no=$out@download_or_link[DataSection:test]
+    yes=(side: src=$src_out@test_text_from_sgm trg=$trg_out@test_text_from_sgm)
+  )
+}
+
 task download_or_link : tools
     :: in=(DataSection: train=$train_prefix dev=$dev_prefix test=$test_prefix)
     :: side=(side: src trg)
diff --git a/tapes/postprocessing.tape b/tapes/postprocessing.tape
index a147a6f..8dbe7c0 100644
--- a/tapes/postprocessing.tape
+++ b/tapes/postprocessing.tape
@@ -1,3 +1,25 @@
+global {
+
+  debped_output=(SubwordMethod:
+    sentencepiece=$out@remove_sentencepiece
+    bpe=$out@debpe
+    none=$out@decode
+  )
+
+  # don't do truecase when doing characterize
+  # truecase will mess up the word boundary annotation
+  detruecased_output=(DoTruecase:
+    yes=$out@detruecase
+    no=$debped_output
+  )
+
+  detokenized_output=(DoTokenize:
+    yes=$out@detokenize
+    no=$detruecased_output
+  )
+
+}
+
 task debpe
     < in=$out@decode
     > out
diff --git a/tapes/subword.tape b/tapes/subword.tape
index a03ed9c..e673969 100644
--- a/tapes/subword.tape
+++ b/tapes/subword.tape
@@ -1,3 +1,11 @@
+global {
+  prepared_data=(SubwordMethod:
+    sentencepiece=$out@apply_sentencepiece
+    bpe=$out@apply_bpe
+    none=$truecased_data
+  )
+}
+
 task train_bpe : subword_nmt
   < src_in=$tokenized_data[DataSection:train,side:src]
   < trg_in=$tokenized_data[DataSection:train,side:trg]
diff --git a/tapes/tokenize.tape b/tapes/tokenize.tape
index 2f747dc..35e9fdf 100644
--- a/tapes/tokenize.tape
+++ b/tapes/tokenize.tape
@@ -1,3 +1,10 @@
+global {
+  tokenized_data=(DoTokenize:
+    yes=$out@tokenize
+    no=$out@download_or_link
+   )
+}
+
 func tokenize : mosesdecoder tools # stanford_seg
     < in
     > out
diff --git a/tapes/truecase.tape b/tapes/truecase.tape
index 0d87a65..37e6873 100644
--- a/tapes/truecase.tape
+++ b/tapes/truecase.tape
@@ -1,3 +1,21 @@
+global {
+  src_truecaser=(UseExistingTruecaser:
+    yes=""
+    no=$out@train_truecaser[side:src]
+  )
+  trg_truecaser=(UseExistingTruecaser:
+    yes=""
+    no=$out@train_truecaser[side:trg]
+  )
+
+  # don't do truecase when doing characterize
+  # truecase will mess up the word boundary annotation
+  truecased_data=(DoTruecase:
+    yes=$out@truecase
+    no=$tokenized_data
+  )
+}
+
 task train_truecaser : mosesdecoder
     < src_in=$prepared_data[DataSection:train,side:src]
     < trg_in=$prepared_data[DataSection:train,side:trg]

From 6334bc699af8f06530a319bc769a25bd88f27ac9 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 14 Dec 2018 13:35:59 -0500
Subject: [PATCH 8/8] restore placeholder email address in action_flags

---
 sockeye.tconf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sockeye.tconf b/sockeye.tconf
index 26f82bc..2ac276c 100644
--- a/sockeye.tconf
+++ b/sockeye.tconf
@@ -43,7 +43,7 @@ global {
   resource_flags_decode="-q gpu.q -l gpu=1,mem_free=4g"
 
   # SGE: flags for notifying about job completion (put in your email address!)
-  action_flags="-m ae -M post@cs.jhu.edu"
+  action_flags="-m ae -M YOUR@EMAIL.ADDRESS"
 
   # The default submitter: shell (run locally) or sge (run on a grid)
   submitter=(TestMode: no="sge" yes="shell")