RF tests, enable test_single_batch_entry globally #1699

Merged: 9 commits, merged on Mar 5, 2025
2 changes: 1 addition & 1 deletion tests/rf_utils.py
@@ -57,7 +57,7 @@ def run_model(
dyn_dim_min_sizes: Optional[Dict[Dim, int]] = None,
test_tensorflow: bool = True,
allow_inf_nan_in_output: bool = False,
test_single_batch_entry: bool = False, # can later enable this globally
test_single_batch_entry: bool = True,
) -> TensorDict:
"""run"""
print(f"* run_model with dyn_dim_max_sizes={dyn_dim_max_sizes!r}")
7 changes: 4 additions & 3 deletions tests/test_rf_array.py
@@ -364,7 +364,8 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))

run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
# Note: The tested op here is a bit meaningless. It is also not consistent for different batch sizes...
run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, test_single_batch_entry=False)
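To make the note above concrete with a hypothetical op (not the one tested here): anything whose per-entry output depends on the other batch entries, for example subtracting a batch-wide mean, cannot pass the single-entry comparison.

import numpy as np

def batch_mean_subtract(x):
    # Per-entry result depends on the whole batch, so it is not batch-composition invariant.
    return x - x.mean(axis=0, keepdims=True)

x = np.random.randn(3, 5, 4)
full = batch_mean_subtract(x)
single = batch_mean_subtract(x[0:1])
print(np.allclose(full[0:1], single))  # False in general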


def test_expand_dim():
@@ -791,7 +792,7 @@ def _forward_step(*, extern_data: TensorDict, **_kwargs):
out = rf.reverse_sequence(extern_data["data"], axis=time_dim, handle_dynamic_dims=False)
out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))

run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step)
run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step, test_single_batch_entry=False)
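Presumably why the single-entry check is disabled here: with handle_dynamic_dims=False the reversal runs over the padded time axis, so an entry's result shifts with the batch's maximum length. A rough NumPy analogue (the pool1d test further down with padding="same" is disabled for a similar padding-dependent reason):

import numpy as np

x = np.zeros((1, 12))
x[0, :5] = np.arange(1.0, 6.0)     # actual length 5, padded to the batch max of 12
rev_padded = x[:, ::-1]            # reverse over the padded axis (handle_dynamic_dims=False)
rev_alone = x[:, :5][:, ::-1]      # same entry reversed at its own length
print(np.array_equal(rev_padded[:, :5], rev_alone))  # False: padding ends up in front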


def test_where():
@@ -877,7 +878,7 @@ def _forward_step(*, model: rf.Conv1d, extern_data: TensorDict):
x, _ = rf.pool1d(x, mode="avg", pool_size=3, strides=1, padding="same", in_spatial_dim=time_dim)
x.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))

run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step)
run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step, test_single_batch_entry=False)


def test_cast_sparse():
98 changes: 85 additions & 13 deletions tests/test_rf_cond.py
@@ -38,7 +38,7 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=(batch_dim, time_dim, out_dim))

run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, test_single_batch_entry=False)


def test_cond_via_time_even():
@@ -69,8 +69,20 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=(batch_dim, time_dim, out_dim))

run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 5},
test_single_batch_entry=False,
)
run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 6},
test_single_batch_entry=False,
)
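Presumably why these cond tests disable the check: the branch depends on a property of the whole (padded) time axis, so cutting the batch down to one shorter entry can flip the branch. A hypothetical sketch in the spirit of test_cond_via_time_even:

import numpy as np

def cond_on_max_time(x):
    # Hypothetical: branch on whether the padded max time is even.
    return x * 2.0 if x.shape[1] % 2 == 0 else x + 1.0

x = np.random.randn(2, 6, 3)                     # max time 6 -> "even" branch
lens = [6, 5]
full = cond_on_max_time(x)
single = cond_on_max_time(x[1:2, :lens[1]])      # alone, the entry has max time 5 -> "odd" branch
print(np.allclose(full[1:2, :lens[1]], single))  # False: a different branch was taken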


def test_cond_shared_params():
@@ -100,8 +112,20 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=(batch_dim, time_dim, out_dim))

run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 5},
test_single_batch_entry=False,
)
run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 6},
test_single_batch_entry=False,
)


def test_cond_twice_shared_params():
@@ -140,8 +164,20 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=(batch_dim, time_dim, out_dim))

run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 5},
test_single_batch_entry=False,
)
run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 6},
test_single_batch_entry=False,
)


def test_cond_param_assign():
@@ -173,8 +209,20 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=())

out1 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
out2 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
out1 = run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 5},
test_single_batch_entry=False,
)
out2 = run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 6},
test_single_batch_entry=False,
)
assert out1["output"].raw_tensor == 2
assert out2["output"].raw_tensor == 5

@@ -208,8 +256,20 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=())

out1 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
out2 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
out1 = run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 5},
test_single_batch_entry=False,
)
out2 = run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 6},
test_single_batch_entry=False,
)
assert out1["output"].raw_tensor == 9
assert out2["output"].raw_tensor == 5

@@ -246,8 +306,20 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out.mark_as_default_output(shape=())
param.mark_as_output(shape=(), name="param")

out1 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
out2 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
out1 = run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 5},
test_single_batch_entry=False,
)
out2 = run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
dyn_dim_max_sizes={time_dim: 6},
test_single_batch_entry=False,
)
assert out1["output"].raw_tensor == 6 and out1["param"].raw_tensor == 2
assert out2["output"].raw_tensor == 42 and out2["param"].raw_tensor == 5

2 changes: 1 addition & 1 deletion tests/test_rf_conv.py
@@ -341,7 +341,7 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
# Note: Currently not the single batch test because there is another problem with RF PT pool,
# which does not correctly handle this case. We get:
# RuntimeError: max_pool1d() Invalid computed output size: -1
# test_single_batch_entry=True,
test_single_batch_entry=False,
)
out = out["output"]
(out_spatial_dim,) = out.get_dyn_size_tags()
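A guess at how the quoted max_pool1d error arises when only a single short entry is fed: the sliced time axis can be shorter than the pooling window, and PyTorch then computes a negative output size.

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 2)                      # batch 1, 3 channels, only 2 frames
try:
    F.max_pool1d(x, kernel_size=4, stride=1)  # window longer than the sequence
except RuntimeError as e:
    print(e)                                  # complains about an invalid computed output size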
31 changes: 21 additions & 10 deletions tests/test_rf_loop.py
@@ -66,20 +66,28 @@ def test_while_loop():

class _Net(rf.Module):
def __call__(self, x: Tensor) -> Tensor:
def _cond(s: Tuple[Tensor, Tensor]):
t, s_ = s
def _cond(s: Tuple[Tensor, Tensor, Tensor]) -> Tensor:
t, ended, s_ = s
if t.raw_tensor.__class__.__module__.startswith("torch"):
print("**", t.raw_tensor, rf.reduce_sum(s_, axis=s_.dims).raw_tensor)
return rf.logical_and(rf.reduce_sum(s_, axis=s_.dims) < 50, t < time_dim.get_dim_value_tensor())
print("**", t.raw_tensor, ended.raw_tensor, rf.reduce_sum(s_, axis=in_dim).raw_tensor)
return rf.logical_not(rf.reduce_all(ended, axis=[batch_dim]))

def _body(s):
t, s_ = s
return t + 1, s_ + rf.abs(rf.gather(x, indices=t, axis=time_dim))

_, final_s = rf.while_loop(
t, ended, s_ = s
cont = rf.logical_and(rf.reduce_sum(s_, axis=in_dim) < 50, t < time_dim.get_size_tensor())
ended = rf.logical_or(ended, rf.logical_not(cont))
s__ = s_ + rf.abs(rf.gather(x, indices=t, axis=time_dim, clip_to_valid=True))
s__ = rf.where(ended, s_, s__)
return t + 1, ended, s__

_, _, final_s = rf.while_loop(
_cond,
_body,
initial=(rf.zeros((), dtype=rf.get_default_array_index_dtype()), rf.zeros((batch_dim, in_dim))),
initial=(
rf.zeros((), dtype=rf.get_default_array_index_dtype()), # t
rf.zeros((batch_dim,), dtype="bool"), # ended
rf.zeros((batch_dim, in_dim)), # s
),
)
return final_s
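A sketch (not the diff itself) of the masked-update pattern the rewritten _cond/_body use: once an entry has ended, rf.where freezes its state, so running extra iterations for the sake of other batch entries no longer changes its result, and the loop output becomes independent of the batch composition.

import numpy as np

def run(x, lens):
    # x: [batch, time], lens: per-entry sequence lengths.
    batch = x.shape[0]
    lens = np.asarray(lens)
    t = 0
    ended = np.zeros(batch, dtype=bool)
    s = np.zeros(batch)
    while not ended.all():                   # cond: loop while any entry still runs
        cont = (s < 50) & (t < lens)
        ended = ended | ~cont
        idx = np.minimum(t, lens - 1)        # analogue of clip_to_valid=True
        s_new = s + np.abs(x[np.arange(batch), idx])
        s = np.where(ended, s, s_new)        # ended entries keep their frozen state
        t += 1
    return s

x = np.random.randn(3, 10)
full = run(x, [10, 7, 4])
single = run(x[1:2, :7], [7])
print(np.allclose(full[1], single[0]))       # True: per-entry result is batch independent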

@@ -209,4 +217,7 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out, beam_dim = model(extern_data["data"])
out.mark_as_default_output(shape=(batch_dim, beam_dim, in_dim))

run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, test_tensorflow=False)
# TODO the way this is implemented, accessing y[-1], is not consistent w.r.t. different batch sizes...
run_model(
extern_data, lambda *, epoch, step: _Net(), _forward_step, test_tensorflow=False, test_single_batch_entry=False
)
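A rough illustration of the TODO above (hypothetical array, not the test's model): indexing the last position of a padded axis reads padding for the shorter entries, so the value depends on the batch's maximum length.

import numpy as np

y = np.zeros((2, 6))
y[0, :6] = np.arange(1.0, 7.0)   # length 6
y[1, :4] = np.arange(1.0, 5.0)   # length 4, the rest is padding
print(y[1, -1])                  # 0.0: -1 indexes the padded axis
print(y[1, :4][-1])              # 4.0: the real last frame when the entry stands alone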
11 changes: 10 additions & 1 deletion tests/test_rf_normalization.py
@@ -36,6 +36,8 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))

# Note: no test_single_batch_entry=False needed here because we currently don't check the running stats,
# and the output currently uses the initial running stats, i.e. should be the same for all batches.
run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
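A sketch of why the check can stay enabled here (going by the comment above): with fixed initial running statistics, the normalization is purely per-entry, so no output value can depend on the rest of the batch.

import numpy as np

def norm_with_running_stats(x, mean=0.0, var=1.0, eps=1e-5):
    # Normalization with fixed (running) statistics is an elementwise op.
    return (x - mean) / np.sqrt(var + eps)

x = np.random.randn(4, 9, 8)
full = norm_with_running_stats(x)
single = norm_with_running_stats(x[2:3])
print(np.allclose(full[2:3], single))  # True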


@@ -62,4 +64,11 @@ def _forward_step(*, model: _Net, extern_data: TensorDict):
out = model(extern_data["data"])
out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))

run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
run_model(
extern_data,
lambda *, epoch, step: _Net(),
_forward_step,
# BatchNorm by definition uses the batch dim.
# Needed here because track_running_stats=False and thus use_current_batch_stats=True.
test_single_batch_entry=False,
)
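And the converse for this test, a sketch of why the check must be off: with use_current_batch_stats, mean and variance are taken over the batch (and time) axes, so each entry's output changes as soon as its batch neighbours change.

import numpy as np

def norm_with_batch_stats(x, eps=1e-5):
    # Statistics over batch and time: every entry's output depends on the whole batch.
    mean = x.mean(axis=(0, 1), keepdims=True)
    var = x.var(axis=(0, 1), keepdims=True)
    return (x - mean) / np.sqrt(var + eps)

x = np.random.randn(4, 9, 8)
full = norm_with_batch_stats(x)
single = norm_with_batch_stats(x[2:3])
print(np.allclose(full[2:3], single))  # False in general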
28 changes: 12 additions & 16 deletions tests/test_rf_rec.py
@@ -16,6 +16,8 @@ def test_lstm():
extern_data = TensorDict(
{
"data": Tensor("data", [batch_dim, time_dim, in_dim], dtype="float32"),
"state_h": Tensor("state_h", [batch_dim, out_dim], dtype="float32"),
"state_c": Tensor("state_c", [batch_dim, out_dim], dtype="float32"),
"classes": Tensor("classes", [batch_dim, time_dim], dtype="int32", sparse_dim=out_dim),
}
)
@@ -32,10 +34,7 @@ def __call__(self, x: Tensor, *, spatial_dim: Dim, state: rf.LstmState) -> Tuple

# noinspection PyShadowingNames
def _forward_step(*, model: _Net, extern_data: TensorDict):
state = rf.LstmState(
h=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32"),
c=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32"),
)
state = rf.LstmState(h=extern_data["state_h"], c=extern_data["state_c"])
out, new_state = model(extern_data["data"], state=state, spatial_dim=time_dim)
out.mark_as_output("out", shape=(batch_dim, time_dim, out_dim))
new_state.h.mark_as_output("h", shape=(batch_dim, out_dim))
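Presumably the reason the random initial state moved from _forward_step into extern_data: a fresh rf.random_normal per call would give the full-batch run and the single-entry rerun different states, making the comparison meaningless, while state fed in as input data gets sliced along the batch dim together with everything else. A rough analogue:

import numpy as np

rng = np.random.default_rng()

def step_with_random_state(x):
    state = rng.standard_normal(x.shape)   # new random state every call: reruns differ
    return x + state

def step_with_given_state(x, state):       # state is part of the input: reruns match
    return x + state

x = np.random.randn(3, 4)
state = np.random.randn(3, 4)
full = step_with_given_state(x, state)
single = step_with_given_state(x[1:2], state[1:2])
print(np.allclose(full[1:2], single))      # True, so the comparison is meaningful
print(np.allclose(step_with_random_state(x)[1:2], step_with_random_state(x[1:2])))  # almost surely False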
@@ -49,6 +48,8 @@ def test_lstm_single_step():
extern_data = TensorDict(
{
"data": Tensor("data", [batch_dim, in_dim], dtype="float32"),
"state_h": Tensor("state_h", [batch_dim, out_dim], dtype="float32"),
"state_c": Tensor("state_c", [batch_dim, out_dim], dtype="float32"),
}
)

@@ -64,10 +65,7 @@ def __call__(self, x: Tensor, *, spatial_dim: Dim, state: rf.LstmState) -> Tuple

# noinspection PyShadowingNames
def _forward_step(*, model: _Net, extern_data: TensorDict):
state = rf.LstmState(
h=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32"),
c=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32"),
)
state = rf.LstmState(h=extern_data["state_h"], c=extern_data["state_c"])
out, new_state = model(extern_data["data"], state=state, spatial_dim=single_step_dim)
out.mark_as_output("out", shape=(batch_dim, out_dim))
new_state.h.mark_as_output("h", shape=(batch_dim, out_dim))
@@ -82,6 +80,8 @@ def test_zoneout_lstm():
extern_data = TensorDict(
{
"data": Tensor("data", [batch_dim, time_dim, in_dim], dtype="float32"),
"state_h": Tensor("state_h", [batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
"state_c": Tensor("state_c", [batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
"classes": Tensor("classes", [batch_dim, time_dim], dtype="int32", sparse_dim=out_dim),
}
)
@@ -103,10 +103,7 @@ def __call__(self, x: Tensor, *, spatial_dim: Dim, state: rf.LstmState) -> Tuple

# noinspection PyShadowingNames
def _forward_step(*, model: _Net, extern_data: TensorDict):
state = rf.LstmState(
h=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
c=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
)
state = rf.LstmState(h=extern_data["state_h"], c=extern_data["state_c"])
out, new_state = model(extern_data["data"], state=state, spatial_dim=time_dim)
out.mark_as_output("out", shape=(batch_dim, time_dim, out_dim))
new_state.h.mark_as_output("h", shape=(batch_dim, out_dim))
@@ -121,6 +118,8 @@ def test_zoneout_lstm_single_step():
extern_data = TensorDict(
{
"data": Tensor("data", [batch_dim, in_dim], dtype="float32"),
"state_h": Tensor("state_h", [batch_dim, out_dim], dtype="float32"),
"state_c": Tensor("state_c", [batch_dim, out_dim], dtype="float32"),
}
)

@@ -141,10 +140,7 @@ def __call__(self, x: Tensor, *, spatial_dim: Dim, state: rf.LstmState) -> Tuple

# noinspection PyShadowingNames
def _forward_step(*, model: _Net, extern_data: TensorDict):
state = rf.LstmState(
h=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
c=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
)
state = rf.LstmState(h=extern_data["state_h"], c=extern_data["state_c"])
out, new_state = model(extern_data["data"], state=state, spatial_dim=single_step_dim)
out.mark_as_output("out", shape=(batch_dim, out_dim))
new_state.h.mark_as_output("h", shape=(batch_dim, out_dim))