No need for a validation split if eval_holdout_size has been specified.

afrozenator · copybara-github · commit 675eae165a94 · 2021-05-21T11:26:50.000-07:00
PiperOrigin-RevId: 375127276
diff --git a/trax/data/inputs.py b/trax/data/inputs.py
@@ -153,9 +153,9 @@ def Parallel(  # pylint: disable=invalid-name
     # Remove generators with zero counters
     counters = list(counters)
     fns = list(fns)
-    zeros = [j for j in range(len(counters)) if counters[j] != 0]
-    counters = [counters[j] for j in zeros]
-    fns = [fns[j] for j in zeros]
+    non_zeros = [j for j in range(len(counters)) if counters[j] != 0]
+    counters = [counters[j] for j in non_zeros]
+    fns = [fns[j] for j in non_zeros]
   else:
     counters = [1] * len(fns)
 
diff --git a/trax/data/testdata/para_crawl/ende/1.2.0/dataset_info.json b/trax/data/testdata/para_crawl/ende/1.2.0/dataset_info.json
@@ -0,0 +1,27 @@
+{
+  "citation": "@misc {paracrawl,\n    title  = \"ParaCrawl\",\n    year   = \"2018\",\n    url    = \"http://paracrawl.eu/download.html.\"\n}",
+  "configDescription": "Translation dataset from English to de.",
+  "configName": "ende",
+  "description": "Web-Scale Parallel Corpora for Official European Languages.",
+  "downloadSize": "1307754745",
+  "location": {
+    "urls": [
+      "https://paracrawl.eu/releases.html"
+    ]
+  },
+  "name": "para_crawl",
+  "splits": [
+    {
+      "name": "train",
+      "numBytes": "3241",
+      "shardLengths": [
+        "10"
+      ]
+    }
+  ],
+  "supervisedKeys": {
+    "input": "en",
+    "output": "de"
+  },
+  "version": "1.2.0"
+}
diff --git a/trax/data/testdata/para_crawl/ende/1.2.0/features.json b/trax/data/testdata/para_crawl/ende/1.2.0/features.json
@@ -0,0 +1,9 @@
+{
+    "type": "tensorflow_datasets.core.features.translation_feature.Translation",
+    "content": {
+        "languages": [
+            "de",
+            "en"
+        ]
+    }
+}
diff --git a/trax/data/testdata/para_crawl/ende/1.2.0/para_crawl-train.tfrecord-00000-of-00001 b/trax/data/testdata/para_crawl/ende/1.2.0/para_crawl-train.tfrecord-00000-of-00001
diff --git a/trax/data/tf_inputs.py b/trax/data/tf_inputs.py
@@ -226,6 +226,7 @@ def _train_and_eval_dataset(dataset_name,
   if dataset_name != 'c4/multilingual' and tfds.Split.TRAIN not in splits:
     raise ValueError('To train we require a train split in the dataset.')
   train_split = tfds.Split.TRAIN if dataset_name != 'c4/multilingual' else 'en'
+  eval_split = None
   train_examples = info.splits[train_split].num_examples
   eval_holdout_examples = int(train_examples * eval_holdout_size)
   if eval_holdout_examples > 0 or subsplit is not None:
@@ -248,7 +249,7 @@ def _train_and_eval_dataset(dataset_name,
         'validation_mismatched' if use_alt_eval else 'validation_matched')
   elif dataset_name == 'c4/multilingual':
     eval_split = 'en-validation'
-  else:
+  elif eval_split is None:
     if tfds.Split.VALIDATION not in splits and 'test' not in splits:
       raise ValueError('We require a validation or test split in the dataset.')
     eval_split = tfds.Split.VALIDATION
diff --git a/trax/data/tf_inputs_test.py b/trax/data/tf_inputs_test.py
@@ -143,6 +143,37 @@ def test_TFDS_single_host_with_eval_holdout(self):
       print(f'Eval: {d}')
       break
 
+  def test_TFDS_single_host_with_eval_holdout_no_valid_split(self):
+    train_ds_gen = tf_inputs.TFDS(
+        'para_crawl/ende',
+        data_dir=_TESTDATA,
+        train=True,
+        host_id=0,
+        keys=('en', 'de'),
+        n_hosts=1,
+        eval_holdout_size=0.1)
+
+    # Just ensure that this doesn't crash.
+    for d in train_ds_gen():
+      print(f'Train: {d}')
+      break
+
+    # para_crawl doesn't have a validation split; check that eval still
+    # doesn't crash, since eval_holdout_size carves eval data out of train.
+    valid_ds_gen = tf_inputs.TFDS(
+        'para_crawl/ende',
+        data_dir=_TESTDATA,
+        train=False,
+        host_id=0,
+        keys=('en', 'de'),
+        n_hosts=1,
+        eval_holdout_size=0.1)
+
+    # Just ensure that this doesn't crash.
+    for d in valid_ds_gen():
+      print(f'Eval: {d}')
+      break
+
   def test_TFDS_mnli_split_is_eval(self):
     with mock.patch('tensorflow_datasets.load') as tfds_load:
       with mock.patch('trax.data.tf_inputs.download_and_prepare',