From 28f0b0b651f31096d46be7b14c0dcc5812de7158 Mon Sep 17 00:00:00 2001 From: Joan Puigcerver Date: Fri, 5 Apr 2024 05:07:03 -0700 Subject: [PATCH] Release config and code necessary to pretrain on LAION-400M. PiperOrigin-RevId: 622146719 --- vmoe/projects/contrastive/evaluators.py | 169 +++++++ vmoe/projects/contrastive/models.py | 186 ++++++++ vmoe/projects/contrastive/models_test.py | 109 +++++ vmoe/projects/contrastive/trainer.py | 414 ++++++++++++++++++ vmoe/projects/soft_moe/README.md | 9 +- .../soft_moe/configs/pretrain_laion.py | 188 ++++++++ vmoe/projects/soft_moe/main_contrastive.py | 20 + 7 files changed, 1094 insertions(+), 1 deletion(-) create mode 100644 vmoe/projects/contrastive/evaluators.py create mode 100644 vmoe/projects/contrastive/models.py create mode 100644 vmoe/projects/contrastive/models_test.py create mode 100644 vmoe/projects/contrastive/trainer.py create mode 100644 vmoe/projects/soft_moe/configs/pretrain_laion.py create mode 100644 vmoe/projects/soft_moe/main_contrastive.py diff --git a/vmoe/projects/contrastive/evaluators.py b/vmoe/projects/contrastive/evaluators.py new file mode 100644 index 0000000..11ee1d9 --- /dev/null +++ b/vmoe/projects/contrastive/evaluators.py @@ -0,0 +1,169 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Evaluators used during contrastive training.""" +import time +from typing import Any, Callable, Iterable, Optional, Tuple + +from clu import metric_writers +from clu import periodic_actions +import jax + +# pylint: disable=g-import-not-at-top +try: + from big_vision.evaluators.proj.image_text import discriminative_classifier as bv_discriminative +except ImportError: + bv_discriminative = None + + +try: + from big_vision.evaluators.proj.image_text import retrieval as bv_retrieval +except ImportError: + bv_retrieval = None +# pylint: enable=g-import-not-at-top + +Array = jax.Array +PyTree = Any + + +class ZeroShotPeriodicAction(periodic_actions.PeriodicCallback): + """Periodic action that runs Big Vision's Retrieval evaluator repeatedly.""" + + def __init__( + self, + *, + metric_writer: metric_writers.MetricWriter, + apply_fn: Callable[..., Tuple[Array, Array, Any]], + data_sharding: jax.sharding.NamedSharding, + every_steps: Optional[int] = None, + every_secs: Optional[float] = None, + on_steps: Optional[Iterable[int]] = None, + report_progress: Optional[periodic_actions.ReportProgress] = None, + report_progress_name: str = 'zeroshot', + **bv_evaluator_kwargs, + ): + """Constructor.""" + if bv_discriminative is None: + raise NotImplementedError( + 'Big Vision must be installed to run the discriminative evaluation.') + bv_evaluator = bv_discriminative.Evaluator( + predict_fn=apply_fn, + devices=list(data_sharding.mesh.devices.flatten()), + **bv_evaluator_kwargs, + ) + callback = self._make_callback_fn( + evaluator=bv_evaluator, + metric_writer=metric_writer, + report_progress=report_progress, + report_progress_name=report_progress_name, + ) + super().__init__( + every_steps=every_steps, + every_secs=every_secs, + on_steps=on_steps, + callback_fn=callback, + execute_async=False, + pass_step_and_time=True) + + def _make_callback_fn( + self, *, evaluator, metric_writer, report_progress, + report_progress_name): + + def callback_fn(step: int, t: Optional[float], variables: PyTree, **kwargs): + del t # Unused. 
+ metrics = {} + t0 = time.time() + for task in evaluator.datasets: + acc = evaluator.evaluate(variables, task)['accuracy'] + t1 = time.time() + metrics[f'{report_progress_name}/{task}/accuracy'] = acc + metrics[f'{report_progress_name}/{task}/duration_secs'] = t1 - t0 + metrics = metrics | {k: v for k, v in kwargs.items() if v is not None} + metric_writer.write_scalars(step, metrics) + + if report_progress is None: + return callback_fn + else: + return report_progress.timed( + report_progress_name, wait_jax_async_dispatch=False)(callback_fn) + + +class RetrievalPeriodicAction(periodic_actions.PeriodicCallback): + """Periodic action that runs Big Vision's Retrieval evaluator repeatedly.""" + + def __init__( + self, + *, + metric_writer: metric_writers.MetricWriter, + apply_fn: Callable[..., Tuple[Array, Array, Any]], + task: str, + data_sharding: jax.sharding.NamedSharding, + every_steps: Optional[int] = None, + every_secs: Optional[float] = None, + on_steps: Optional[Iterable[int]] = None, + report_progress: Optional[periodic_actions.ReportProgress] = None, + report_progress_name: str = 'retrieval', + **bv_evaluator_kwargs, + ): + """Constructor.""" + if bv_retrieval is None: + raise NotImplementedError( + 'Big Vision must be installed to run the retrieval evaluation.') + bv_evaluator = bv_retrieval.Evaluator( + predict_fn=apply_fn, + devices=list(data_sharding.mesh.devices.flatten()), + **bv_evaluator_kwargs, + ) + callback = self._make_callback_fn( + evaluator=bv_evaluator, + task=task, + metric_writer=metric_writer, + report_progress=report_progress, + report_progress_name=report_progress_name, + ) + super().__init__( + every_steps=every_steps, + every_secs=every_secs, + on_steps=on_steps, + callback_fn=callback, + execute_async=False, + pass_step_and_time=True) + + def _make_callback_fn( + self, *, evaluator, task, metric_writer, report_progress, + report_progress_name): + + def callback_fn(step: int, t: Optional[float], variables: PyTree, **kwargs): + del t # Unused. + metrics = {} + t0 = time.time() + bv_metrics = evaluator.evaluate(variables) + metrics.update({ + f'{report_progress_name}/{task}/txt2img/{k}': v + for k, v in bv_metrics['txt2img'].items() + }) + metrics.update({ + f'{report_progress_name}/{task}/img2txt/{k}': v + for k, v in bv_metrics['img2txt'].items() + }) + t1 = time.time() + metrics[f'{report_progress_name}/{task}/duration_secs'] = t1 - t0 + metrics = metrics | {k: v for k, v in kwargs.items() if v is not None} + metric_writer.write_scalars(step, metrics) + + if report_progress is None: + return callback_fn + else: + return report_progress.timed( + report_progress_name, wait_jax_async_dispatch=False)(callback_fn) diff --git a/vmoe/projects/contrastive/models.py b/vmoe/projects/contrastive/models.py new file mode 100644 index 0000000..bcf5764 --- /dev/null +++ b/vmoe/projects/contrastive/models.py @@ -0,0 +1,186 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Two tower model used for contrastive learning.""" +import functools +import sys +from typing import Any, Mapping, Literal, Optional, Tuple + +import flax.linen as nn +import jax +import jax.numpy as jnp +from vmoe import utils +from vmoe.nn import vit_moe + +Array = jax.Array + +_default_image_module = vit_moe +_default_text_module = sys.modules[__name__] + + +class TextTransformer(nn.Module): + """Text transformer similar to CLIP, allowing blocks with MoEs.""" + + # Differences to CLIP text encoder (gpt-2) that I am aware of: + # 1. https://imgur.com/HNi3jix (gpt-1) + # 2. https://imgur.com/qKGZgBR (gpt-2) + # 3. https://imgur.com/a/xrpYHF0 (clip) + # - LayerNorm is on res-path (like pre-activation resnet) + # - dropout 0.1 everywhere + # - init as var=0.02, scaled by depth + # - BOS and EOS tokens, take repr from EOS. + # - self-attention is autoregressively masked. + # - scaled in width only, with the image model. + vocab_size: int + num_classes: Optional[int] + hidden_size: int + encoder: Mapping[str, Any] + pool_type: Literal['last', 'first', 'gap', 'gmp', 'map'] = 'last' + deterministic: bool = False + head_bias_init: float = 0.0 + head_kernel_zero_init: bool = False + + @property + def kernel_init(self) -> nn.initializers.Initializer: + if self.head_kernel_zero_init: + return nn.initializers.zeros + else: + return nn.linear.default_kernel_init + + @nn.compact + def __call__(self, text): + # We can't use where/argwhere since the output shape is not fixed. + # Here we use the fact that sequences are padded with EOS tokens, that the + # EOS token has value 1, and that argmin returns the first index. + # eos_indices = jnp.argmin(text, axis=1) + + embedding = nn.Embed( + num_embeddings=self.vocab_size, features=self.hidden_size) + x = embedding(text) + + # TODO(jpuigcerver): Move position embedding outside of the Encoder class. + encoder_kwargs = dict(self.encoder) + if encoder_kwargs.get('position_emb', {}).get('name') == 'sincos2d': + raise ValueError( + 'sincos2d position embeddings are not supproted for text.') + + x, metrics = vit_moe.EncoderMoe( + name='Encoder', deterministic=self.deterministic, **encoder_kwargs)(x) + + x = self.apply_pooling(x) + + if self.num_classes: + # Linear head outputing the logits for classification. + logits = nn.Dense( + features=self.num_classes, + name='head', + kernel_init=self.kernel_init, + bias_init=nn.initializers.constant(self.head_bias_init))(x) + return logits, metrics + else: + return x, metrics + + @nn.nowrap + def apply_pooling(self, x): + match self.pool_type: + case 'last': return x[:, -1, :] + case 'first': return x[:, 0, :] + case 'gap': return x.mean(axis=1) + case 'gmp': return x.max(axis=1) + case 'map': + return vit_moe.MapHead( + num_heads=self.encoder['num_heads'], + mlp_dim=self.encoder['mlp_dim'], + qk_norm=self.encoder.get('attention_qk_norm', False), + name='MapHead')(x) + case _: + raise NotImplementedError(f'Cannot do pooling {self.pool_type!r}') + + +class TwoTower(nn.Module): + """A two-tower encoder model.""" + image: Mapping[str, Any] + text: Mapping[str, Any] + scale_init: float = 1.0 + bias_init: float | None = None + deterministic: bool = False + + @functools.cached_property + def image_model_class(self): + # Default model for the image encoder is a Vision Transformer with MoEs. 
+ model_cls = self.image.get('name', 'VisionTransformerMoe') + model_cls, args, kwargs = utils.parse_call(model_cls, _default_image_module) + kwargs.update({k: v for k, v in self.image.items() if k != 'name'}) + return functools.partial( + model_cls, *args, **kwargs, deterministic=self.deterministic) + + @functools.cached_property + def text_model_class(self): + # Default model for the text encoder is a Text Transformer. + model_cls = self.text.get('name', 'TextTransformer') + model_cls, args, kwargs = utils.parse_call(model_cls, _default_text_module) + kwargs.update({k: v for k, v in self.text.items() if k != 'name'}) + return functools.partial( + model_cls, *args, **kwargs, deterministic=self.deterministic) + + @nn.compact + def __call__( + self, + images: Array | None, + texts: Array | None, + ) -> Tuple[Array, Mapping[str, Any]]: + if images is None and texts is None: + raise ValueError('You must give at least one of images or texts arrays.') + zimg, ztxt, metrics = None, None, {} + + if images is not None: + zimg, metrics_img = self.image_model_class(name='img')(images) + zimg_norm = jnp.linalg.norm(zimg, axis=-1, keepdims=True) + zimg /= zimg_norm + 1e-8 + self.sow('intermediates', 'zimg', zimg) + metrics['img'] = metrics_img + + if texts is not None: + ztxt, metrics_txt = self.text_model_class(name='txt')(texts) + ztxt_norm = jnp.linalg.norm(ztxt, axis=-1, keepdims=True) + ztxt /= ztxt_norm + 1e-8 + self.sow('intermediates', 'ztxt', ztxt) + metrics['txt'] = metrics_txt + + if images is None: + # Return text embeddings and metrics. + return ztxt, metrics + elif texts is None: + # Return image embeddings and metrics. + return zimg, metrics + else: + # Compute logits as the dot product of the image and text embeddings. + logits = jnp.einsum('...md,...nd->...mn', zimg, ztxt) + + # Note: Big Vision calls this "temperature", but it's actually + # 1/temperature, if one uses the standard definition of temperature. + scale_init = jnp.log(self.scale_init) + s = self.param('s', nn.initializers.constant(scale_init), + (), jnp.float32).astype(logits.dtype) + s = jnp.exp(s) + logits *= s + metrics['scale'] = s + + if self.bias_init is not None: + b = self.param('b', nn.initializers.constant(self.bias_init), + (), jnp.float32).astype(logits.dtype) + logits += b + + # Return the logits and the metrics. + return logits, metrics diff --git a/vmoe/projects/contrastive/models_test.py b/vmoe/projects/contrastive/models_test.py new file mode 100644 index 0000000..3b59265 --- /dev/null +++ b/vmoe/projects/contrastive/models_test.py @@ -0,0 +1,109 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from absl.testing import absltest +import chex +import jax +import jax.numpy as jnp +from vmoe.projects.contrastive import models + + +class TwoTowerTest(absltest.TestCase): + + def setUp(self): + super().setUp() + self.image_config = { + 'num_classes': self.output_dim, + 'patch_size': (2, 2), + 'hidden_size': 64, + 'classifier': 'gap', + 'encoder': { + 'num_layers': 2, + 'mlp_dim': 256, + 'num_heads': 2, + }, + 'head_kernel_zero_init': True, + } + self.text_config = { + 'num_classes': self.output_dim, + 'hidden_size': 64, + 'encoder': { + 'num_layers': 2, + 'mlp_dim': 256, + 'num_heads': 2, + }, + 'vocab_size': 128, + } + + @property + def output_dim(self) -> int: + return 32 + + def test(self): + """Tests initialization and forward pass.""" + batch_size, height, width, text_len = 4, 8, 8, 16 + model = models.TwoTower( + image=self.image_config, + text=self.text_config, + scale_init=2.0, + bias_init=1.0, + ) + + @jax.jit + def init_fn(): + images = jnp.zeros((batch_size, height, width, 3), dtype=jnp.float32) + texts = jnp.zeros((batch_size, text_len), dtype=jnp.int32) + return model.init({'params': jax.random.PRNGKey(0)}, images, texts) + + variables = init_fn() + self.assertIn('txt', variables['params']) + self.assertIn('img', variables['params']) + self.assertIn('s', variables['params']) + self.assertIn('b', variables['params']) + # Check shape and initial values for scale and bias params. + chex.assert_trees_all_close( + variables['params']['s'], jnp.log(jnp.asarray(2., dtype=jnp.float32))) + chex.assert_trees_all_close( + variables['params']['b'], jnp.asarray(1., dtype=jnp.float32)) + + @jax.jit + def forward(variables, images, text): + return model.apply(variables, images, text) + + # Forward with both images and text embeddings, logits' shape must be + # (batch_size, batch_size). + images = jnp.zeros((batch_size, height, width, 3), dtype=jnp.float32) + texts = jnp.zeros((batch_size, text_len), dtype=jnp.int32) + logits, _ = forward(variables, images, texts) + chex.assert_trees_all_equal_shapes_and_dtypes( + logits, + jax.ShapeDtypeStruct((batch_size, batch_size), jnp.float32)) + + # Forward only images: the output should be all 0s, since the image head + # kernel is initialized with 0. + zimg, _ = forward(variables, images, None) + chex.assert_trees_all_close( + zimg, jnp.zeros((batch_size, self.output_dim), jnp.float32)) + + # Forward only texts: the output should be different than 0s, since the text + # head kernel is NOT initialized with 0s. + ztxt, _ = forward(variables, None, texts) + chex.assert_trees_all_equal_shapes_and_dtypes( + ztxt, + jax.ShapeDtypeStruct((batch_size, self.output_dim), jnp.float32)) + self.assertGreater(jnp.abs(ztxt).sum(), 0.) + + +if __name__ == '__main__': + absltest.main() diff --git a/vmoe/projects/contrastive/trainer.py b/vmoe/projects/contrastive/trainer.py new file mode 100644 index 0000000..cb071b7 --- /dev/null +++ b/vmoe/projects/contrastive/trainer.py @@ -0,0 +1,414 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Classes and functions used for training (from-scratch and fine-tuning).""" +import functools +import multiprocessing.pool +import os +import time +from typing import Any, Callable, Mapping, Optional, Sequence, Tuple + +from absl import logging +from clu import metric_writers +import flax +import flax.serialization +import flax.training.train_state +import flax.traverse_util +import jax +import jax.numpy as jnp +import ml_collections +import tensorflow as tf +from vmoe import multihost_utils +from vmoe import partitioning +from vmoe import utils +from vmoe.data import input_pipeline +from vmoe.data import pjit_utils +from vmoe.evaluate import fewshot +from vmoe.projects.contrastive import evaluators +from vmoe.train import periodic_actions as train_periodic_actions +from vmoe.train import train_state as train_state_module +from vmoe.train import trainer +from vmoe.train import tree_summarizer + + +Array = jax.numpy.ndarray +DatasetIterator = input_pipeline.DatasetIterator +Mesh = partitioning.Mesh +ReportProgress = train_periodic_actions.ReportProgress +ThreadPool = multiprocessing.pool.ThreadPool +TrainState = train_state_module.TrainState +TreeSummarizer = tree_summarizer.TreeSummarizer + +accumulate_gradients_and_metrics = trainer.accumulate_gradients_and_metrics +create_checkpoint_manager = trainer.create_checkpoint_manager +create_flax_model = trainer.create_flax_model +create_profile_hook = trainer.create_profile_hook +create_progress_hook = trainer.create_progress_hook +create_tree_summarizer = trainer.create_tree_summarizer +get_dataset_iterator = trainer.get_dataset_iterator +get_train_steps_and_epochs = trainer.get_train_steps_and_epochs +make_create_train_state_fn = trainer.make_create_train_state_fn +make_train_cost_fn = trainer.make_train_cost_fn +override_base_config = trainer.override_base_config +restore_or_create_train_state = trainer.restore_or_create_train_state + + +def create_fewshot_hook( + *, + base_model_config: ml_collections.ConfigDict, + writer: metric_writers.MetricWriter, + progress_hook: ReportProgress, + first_step: int, + train_steps: int, + extra_rng_keys: Sequence[str], + model_overrides: Optional[ml_collections.ConfigDict] = None, + **kwargs) -> Callable[..., Any]: + """Returns a hook to run fewshot evaluation of a model periodically.""" + model_config = override_base_config(base_model_config, model_overrides) + # Few-shot eval requires additional mandatory parameters. If none of those is + # given, we assume that no few-shot eval should be done. + if not kwargs: + return (lambda *args, **kw: None) + model = create_flax_model( + config=model_config.to_dict(), deterministic=True) + # Apply function only embeds images. + apply_fn = lambda p, x, **kw: model.apply(p, images=x, texts=None, **kw) + on_steps = set(kwargs.pop('on_steps', [])) + # Always evaluate on the first and last step. 
+ on_steps.update([first_step, train_steps]) + periodic_action = fewshot.FewShotPeriodicAction( + metric_writer=writer, + apply_fn=apply_fn, + rng_keys=extra_rng_keys, + report_progress=progress_hook, + report_progress_name='fewshot', + on_steps=on_steps, + **kwargs) + return periodic_action + + +def create_retrieval_hook( + *, + base_model_config: ml_collections.ConfigDict, + writer: metric_writers.MetricWriter, + progress_hook: ReportProgress, + first_step: int, + train_steps: int, + every_steps: Optional[int] = None, + every_secs: Optional[int] = None, + datasets: Optional[Mapping[str, Mapping[str, Any]]] = None, + model_overrides: Optional[ml_collections.ConfigDict] = None, + data_sharding: jax.sharding.NamedSharding, + **kwargs) -> Callable[..., Any]: + """Returns a hook to run retrieval evaluation of a model periodically.""" + model_config = override_base_config(base_model_config, model_overrides) + model = create_flax_model( + config=model_config.to_dict(), deterministic=True) + # Always evaluate on the first and last step. + on_steps = set(kwargs.pop('on_steps', [])) + on_steps.update([first_step, train_steps]) + + # Make the apply_fn function conform with Big Vision's evaluator expected + # inputs and outputs. + def apply_fn(v, input_dict): + img = input_dict.get('image') + txt = input_dict.get('labels') + if (img is None) == (txt is None): + raise ValueError('One and only of images or text must be None.') + z, _ = model.apply(v, images=img, texts=txt) + return (None, z, None) if img is None else (z, None, None) + + datasets = datasets or {} + if isinstance(datasets, ml_collections.ConfigDict): + datasets = datasets.to_dict() + try: + # Instantiate hooks for each of the tasks to evaluate. + hooks = [ + evaluators.RetrievalPeriodicAction( + metric_writer=writer, + apply_fn=apply_fn, + task=task, + data_sharding=data_sharding, + every_steps=every_steps, + every_secs=every_secs, + on_steps=on_steps, + report_progress=progress_hook, + **kwargs, + **bv_kw) + for task, bv_kw in datasets.items() + ] + def periodic_action(*a, **kw): + for hook in hooks: + hook(*a, **kw) + return periodic_action + except NotImplementedError as e: + logging.warning('%s', str(e)) + return (lambda *a, **kw: None) + + +def create_zeroshot_hook( + *, + base_model_config: ml_collections.ConfigDict, + writer: metric_writers.MetricWriter, + progress_hook: ReportProgress, + first_step: int, + train_steps: int, + every_steps: Optional[int] = None, + every_secs: Optional[int] = None, + datasets: Optional[Mapping[str, Mapping[str, Any]]] = None, + model_overrides: Optional[ml_collections.ConfigDict] = None, + data_sharding: jax.sharding.NamedSharding, + **kwargs) -> Callable[..., Any]: + """Returns a hook to run zeroshot evaluation of a model periodically.""" + model_config = override_base_config(base_model_config, model_overrides) + model = create_flax_model( + config=model_config.to_dict(), deterministic=True) + # Always evaluate on the first and last step. + on_steps = set(kwargs.pop('on_steps', [])) + on_steps.update([first_step, train_steps]) + + # Make the apply_fn function conform with Big Vision's evaluator expected + # inputs and outputs. 
+ def apply_fn(v, input_dict): + img = input_dict.get('image') + txt = input_dict.get('labels') + if (img is None) == (txt is None): + raise ValueError('One and only of images or text must be None.') + z, _ = model.apply(v, images=img, texts=txt) + return (None, z, None) if img is None else (z, None, None) + + datasets = datasets or {} + if isinstance(datasets, ml_collections.ConfigDict): + datasets = datasets.to_dict() + if not datasets: + return (lambda *a, **kw: None) + + try: + return evaluators.ZeroShotPeriodicAction( + metric_writer=writer, + apply_fn=apply_fn, + data_sharding=data_sharding, + every_steps=every_steps, + every_secs=every_secs, + on_steps=on_steps, + report_progress=progress_hook, + dataset_names=tuple(datasets.keys()), + dataset_overrides=datasets, + **kwargs) + except NotImplementedError as e: + logging.warning('%s', str(e)) + return (lambda *a, **kw: None) + + +def sigmoid_loss(logits: Array): + if logits.ndim < 2 or logits.shape[-1] != logits.shape[-2]: + raise ValueError( + f'Last two dims of logits must be equal, but got {logits.shape=}') + # SigLIP loss, as described in https://arxiv.org/pdf/2303.15343.pdf. + # Positives are in the diagonal, negatives are off-diagonal. + z = 2. * jnp.eye(logits.shape[-1], dtype=logits.dtype) - 1. + log_lkh = jax.nn.log_sigmoid(jnp.einsum('...mn,mn->...mn', logits, z)) + # Normalize by npos per column, but that's one, so just sum. + return -jnp.sum(log_lkh, axis=-1) + + +def train_step( + state: TrainState, + images: Array, + texts: Array, + loss_fn: Callable[[Array], Array], + microsteps: Optional[int] = None, + summarizer: Optional[TreeSummarizer] = None, +) -> Tuple[TrainState, Mapping[str, Any]]: + """Performs one update step of the given TrainState object .""" + + @functools.partial(jax.grad, has_aux=True) + def compute_grads_and_metrics(params, images, texts, rngs): + rngs, next_rngs = utils.tree_rngs_split(rngs) + logits, metrics = state.apply_fn( + {'params': params}, images, texts, rngs=rngs) + metrics = dict(**metrics) + metrics['main_loss'] = jnp.mean(loss_fn(logits)) + metrics = jax.tree_util.tree_map(jnp.mean, metrics) + total_loss = metrics['main_loss'] + metrics.get('auxiliary_loss', 0.0) + metrics['total_loss'] = total_loss + return total_loss, (next_rngs, metrics) + + compute_grads_and_metrics = accumulate_gradients_and_metrics( + compute_grads_and_metrics, microsteps) + grads, (next_rngs, metrics) = compute_grads_and_metrics( + state.params, images, texts, state.rngs) + state, global_norms = state.apply_gradients_and_compute_global_norms( + grads, rngs=next_rngs) + metrics.update({f'global_norm/{k}': v for k, v in global_norms.items()}) + + if summarizer: + # Summarize arrays in the gradients tree or the train state. + state_flat = flax.traverse_util.flatten_dict( + flax.serialization.to_state_dict(state), sep='/') + state_flat['params_grads'] = flax.traverse_util.flatten_dict(grads, sep='/') + metrics.update(summarizer(state_flat)) + + return state, metrics + + +def train_and_evaluate(config: ml_collections.ConfigDict, workdir: str, + mesh: Mesh, writer: metric_writers.MetricWriter): + """Trains a model and evaluates it periodically.""" + datasets = input_pipeline.get_datasets(config.dataset) + if 'train' not in datasets: + raise KeyError(f'You must have a "train" variant of the dataset. 
' + f'Available variants are {sorted(datasets.keys())!r}') + train_examples = input_pipeline.get_data_num_examples(config.dataset.train) + train_batch_size = config.dataset.train.batch_size + train_steps, train_epochs = get_train_steps_and_epochs( + train_steps=config.get('train_steps'), + train_epochs=config.get('train_epochs'), + train_batch_size=train_batch_size, + train_examples=train_examples) + logging.info( + 'Training for %d steps (%g epochs) over %d examples, with a ' + 'batch size of %d', train_steps, train_epochs, train_examples, + train_batch_size) + + # Get the global shape of the image array. + dataset_element_shape_dtype = pjit_utils.get_dataset_shape_dtype_struct( + datasets['train']) + + ckpt_manager = create_checkpoint_manager( + workdir=workdir, **config.get('save_checkpoint', {})) + train_state_initialize_fn = make_create_train_state_fn( + model=create_flax_model(config=config.model, deterministic=False), + optimizer_config=config.optimizer, + input_shape_dtypes=(dataset_element_shape_dtype['image'], + dataset_element_shape_dtype['text']), + train_steps=train_steps, + extra_rng_keys=tuple(config.get('extra_rng_keys', [])), + seed=config.get('seed', 0)) + train_state, last_seen_index = restore_or_create_train_state( + ckpt_manager=ckpt_manager, + initialize_fn=train_state_initialize_fn, + axis_resources_regexes=config.params_axis_resources, + thread_pool=ThreadPool(), + initialization_kwargs=config.get('initialization')) + init_step = int(train_state.step) + logging.info('Initial step = %d', init_step) + tr_iter = get_dataset_iterator( + dataset=datasets['train'], + prefetch_size=config.dataset.train.get('prefetch_device', 1), + mesh=mesh, + last_seen_index=last_seen_index) + summarizer = create_tree_summarizer(config.get('summarize_arrays')) + train_step_fn = functools.partial( + train_step, + loss_fn=sigmoid_loss, + microsteps=config.get('microsteps'), + summarizer=summarizer) + + train_step_pjit = jax.jit( + fun=train_step_fn, + out_shardings=( + jax.tree_util.tree_map(lambda x: x.sharding, train_state), + None, + ), + donate_argnums=(0, 1, 2), + ) + + # Setup hooks. + profile_hook = create_profile_hook( + workdir=workdir, **config.get('profile', {})) + progress_hook = create_progress_hook( + writer=writer, first_step=init_step + 1, train_steps=train_steps, + **config.get('report_progress', {})) + fewshot_hook = create_fewshot_hook( + base_model_config=config.model.copy_and_resolve_references(), + writer=writer, + progress_hook=progress_hook, + first_step=init_step + 1, + train_steps=train_steps, + extra_rng_keys=config.get('extra_rng_keys', []), + **config.get('fewshot', {})) + retrieval_hook = create_retrieval_hook( + data_sharding=dataset_element_shape_dtype['image'].sharding, + base_model_config=config.model.copy_and_resolve_references(), + writer=writer, + progress_hook=progress_hook, + first_step=init_step + 1, + train_steps=train_steps, + **config.get('retrieval', {})) + zeroshot_hook = create_zeroshot_hook( + data_sharding=dataset_element_shape_dtype['image'].sharding, + base_model_config=config.model.copy_and_resolve_references(), + writer=writer, + progress_hook=progress_hook, + first_step=init_step + 1, + train_steps=train_steps, + **config.get('zeroshot', {})) + # Run checkpoint hook just before starting the loop. This will save the train + # state at initialization. 
+  def _save_checkpoint(step, ts, it, force=False):
+    last_seen_index = step * train_batch_size
+    with progress_hook.timed('ckpt', wait_jax_async_dispatch=False):
+      ckpt_manager.save(
+          step,
+          items={
+              'state': ts,
+              'dataset_iterator': {'last_seen_index': last_seen_index},
+          },
+          force=force)
+  if init_step == 0 and not tf.io.gfile.exists(os.path.join(workdir, 'ckpt/0')):
+    multihost_utils.sync_devices('training:ckpt-first')
+    _save_checkpoint(init_step, train_state, tr_iter, force=True)
+  # Explicitly compile train_step here and report the compilation time.
+  t0 = time.time()
+  train_step_pjit = train_step_pjit.lower(
+      train_state,
+      dataset_element_shape_dtype['image'],
+      dataset_element_shape_dtype['text']).compile()
+  t1 = time.time()
+  # Report compilation time, plus FLOPs and optimal seconds per step and device.
+  writer.write_scalars(init_step + 1, {'train/compile_secs': t1 - t0})
+  train_step_flops_per_device, train_step_seconds_per_device = (
+      utils.get_flops_and_seconds_per_device(train_step_pjit))
+  if train_step_flops_per_device:
+    writer.write_scalars(
+        init_step + 1,
+        {'train/step_flops_per_device': train_step_flops_per_device})
+  if train_step_seconds_per_device:
+    writer.write_scalars(
+        init_step + 1,
+        {'train/step_seconds_per_device': train_step_seconds_per_device})
+  train_cost_fn = make_train_cost_fn(train_step_pjit)
+  for step, batch in zip(range(init_step + 1, train_steps + 1), tr_iter):
+    profile_hook(step)
+    with jax.profiler.StepTraceAnnotation('train', step_num=step):
+      train_state, metrics = train_step_pjit(train_state, batch['image'],
+                                             batch['text'])
+    progress_hook(step, scalar_metrics=(
+        train_cost_fn(step) | {f'train/{k}': v for k, v in metrics.items()}
+    ))
+    _save_checkpoint(step, train_state, tr_iter)
+    fewshot_hook(step, variables={'params': train_state.params},
+                 **train_cost_fn(step))
+    retrieval_hook(step, variables={'params': train_state.params},
+                   **train_cost_fn(step))
+    zeroshot_hook(step, variables={'params': train_state.params},
+                  **train_cost_fn(step))
+  ckpt_manager.wait_until_finished()
+  if not tf.io.gfile.exists(os.path.join(workdir, f'ckpt/{train_steps}')):
+    multihost_utils.sync_devices('training:ckpt-last')
+    _save_checkpoint(train_steps, train_state, tr_iter, force=True)
+    ckpt_manager.wait_until_finished()
+  multihost_utils.sync_devices('training:completed')
+  logging.info('Training completed.')
diff --git a/vmoe/projects/soft_moe/README.md b/vmoe/projects/soft_moe/README.md
index 2bc6aaf..a5d624a 100644
--- a/vmoe/projects/soft_moe/README.md
+++ b/vmoe/projects/soft_moe/README.md
@@ -8,5 +8,12 @@ This folder contains the implementation of Soft MoE, presented in the paper:
 We provide the config files used to run some of the experiments reported in
 the paper.
 
-Notice that all experiments either train on JFT-4B, a proprietary dataset,
+Notice that most experiments either train on JFT-4B, a proprietary dataset,
 or use models pre-trained on it, thus we cannot release any of the checkpoints.
+We have released the config file used to train on JFT-4B from scratch, for
+reference.
+
+We have also included a config file to pretrain on LAION-400M, which is a
+publicly available dataset. This can be used to replicate the experiments on
+this dataset that are reported in the paper. Note, however, that we are not
+planning to release any checkpoints trained on this dataset.
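The config added in the next hunk (vmoe/projects/soft_moe/configs/pretrain_laion.py) can be loaded and inspected before launching a run. A minimal sketch, assuming vmoe and its dependencies are installed; the printed values are the ones set by this patch for the default 'soft-s16' model:

from vmoe.projects.soft_moe.configs import pretrain_laion

config = pretrain_laion.get_config('soft-s16')       # Soft MoE with a ViT-S/16 image tower.
print(config.train_steps)                            # 750000 steps, at batch size 16384.
print(config.model.image.encoder.moe.num_experts)    # 128 experts in the image encoder.
print(config.model.text.hidden_size)                 # 768: the text tower is B-sized.
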
diff --git a/vmoe/projects/soft_moe/configs/pretrain_laion.py b/vmoe/projects/soft_moe/configs/pretrain_laion.py new file mode 100644 index 0000000..bc3c643 --- /dev/null +++ b/vmoe/projects/soft_moe/configs/pretrain_laion.py @@ -0,0 +1,188 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=line-too-long +r"""Train different models used in the Soft MoE paper on the LAION dataset. + +""" +# pylint: enable=line-too-long +import ml_collections +from vmoe.projects.soft_moe.configs import common + +BATCH_SIZE = 16_384 +DATASET = 'laion400m' + + +def get_default_moe_num_experts_and_last_n(variant, patch_size): + """Default number of experts and MoE layers for Sparse and Soft MoEs.""" + num_experts = { + ('S', 16): 128, + ('S', 14): 256, + ('B', 16): 128, + ('L', 16): 128, + ('H', 14): 256, + }[(variant, patch_size)] + last_n = { + 'S': 6, + 'B': 6, + 'L': 12, + 'H': 16, + }[variant] + return num_experts, last_n + + +# pylint: disable=line-too-long +def tokenize(inkey: str, outkey: str = 'text') -> str: + return f'tokenize(max_len=16, model="c4_en", eos="sticky", inkey="{inkey}", outkey="{outkey}", pad_value=1)' +# pylint: enable=line-too-long + + +def get_config(model='soft-s16') -> ml_collections.ConfigDict: + """Config to train different models used in the Soft MoE paper.""" + # Parse model argument. + model_type, model_backbone = model.split('-') + patch_size = int(model_backbone[1:]) + variant = model_backbone[0].upper() + + # SoftMoEs highly benefit from data augmentation, while ViTs and MoEs with + # Experts Choice routing actually do worse. See Figure 15 in the paper. 
+ # + if model_type in ('vit', 'ec'): + process_str = 'decode|resize(256)' + else: + process_str = 'decode_jpeg_and_inception_crop(256)' + + config = common.get_base_config() + config.dataset = ml_collections.ConfigDict() + config.dataset.train = common.get_data_config( + name=DATASET, + split='full[16384:]', + batch_size=BATCH_SIZE, + process=( + f'{process_str}|value_range(-1,1)|flatten|' + f'{tokenize("text")}|keep("image", "text")' + ), + shuffle_buffer=250_000, + ) + config.fewshot = common.get_fewshot_config( + batch_size=1_024, resize_resolution=292, target_resolution=256, + every_steps=10_000, seeds_per_step=3) + config.fewshot.model_overrides = ml_collections.ConfigDict() + config.retrieval = ml_collections.ConfigDict({ + 'batch_size': 1_024, + 'every_steps': 10_000, + 'datasets': { + 'coco': { + 'dataset': 'coco_captions', + 'txt_name': ('captions', 'text'), + 'pp_img': 'resize(256)|value_range(-1, 1)', + 'pp_txt': f'{tokenize(inkey="texts", outkey="labels")}', + }, + 'flickr': { + 'dataset': 'argus:flickr30k/captions', + 'txt_name': 'texts', + 'pp_img': 'resize(256)|value_range(-1, 1)', + 'pp_txt': f'{tokenize(inkey="texts", outkey="labels")}', + }, + } + }) + config.zeroshot = ml_collections.ConfigDict({ + 'batch_size': 1_024, + 'every_steps': 10_000, + 'pp_img': 'resize(256)|value_range(-1, 1)', + 'pp_txt': f'{tokenize(inkey="texts", outkey="labels")}', + 'datasets': { + 'cifar100': {}, + 'imagenet2012': {'class_names': 'clip', 'split': 'validation'}, + 'oxford_iiit_pet': {}, + }, + }) + + # Optimizer configuration. + config.optimizer = common.get_optimizer_rsqrt_config() + config.optimizer.weight_decay = (('.*/kernel', 0.1),) + config.optimizer.learning_rate.warmup_steps = 20_000 + config.optimizer.learning_rate.cooldown_steps = 20_000 + config.train_steps = 750_000 + + config.model = ml_collections.ConfigDict({ + 'name': 'vmoe.projects.contrastive.models.TwoTower', + 'bias_init': -10.0, + 'scale_init': 10.0, + }) + + # Image encoder hyperparameters depend on the model type. + if model_type == 'vit': + config.model.image = common.get_vit_config(variant, patch_size, None) + elif model_type == 'ec': + num_experts, last_n = get_default_moe_num_experts_and_last_n( + variant, patch_size) + config.model.image = common.get_vmoe_experts_choose_config( + variant, patch_size, None, image_size=256, + num_experts=num_experts, last_n=last_n, capacity_factor=1.0) + elif model_type == 'soft': + num_experts, last_n = get_default_moe_num_experts_and_last_n( + variant, patch_size) + config.model.image = common.get_vmoe_soft_router_config( + variant, patch_size, None, image_size=256, + num_experts=num_experts, last_n=last_n, capacity_factor=None, + num_slots=1) + config.model.image.encoder.moe.router.compute_similarity_metrics = False + else: + raise ValueError(f'Unknown model type: {model_type!r}') + + # Text encoder is a B size model. + config.model.text = ml_collections.ConfigDict({ + 'vocab_size': 32_000, + 'num_classes': config.model.image.hidden_size, + 'hidden_size': 768, + 'encoder': { + 'num_layers': 12, + 'mlp_dim': 3072, + 'num_heads': 12, + 'dropout_rate': 0.0, + 'attention_dropout_rate': 0.0, + 'attention_qk_norm': True, + 'moe': {'layers': ()}, + } + }) + + # These control how the train state is partitioned across the device mesh. 
+ if model_type == 'vit': + config.num_expert_partitions = 1 + config.params_axis_resources = [] + else: + config.num_expert_partitions = config.model.image.encoder.moe.num_experts + config.params_axis_resources = [('Moe/Mlp/.*', ('expert',))] + config.extra_rng_keys = ('dropout', 'gating') + # Plot summary of different arrays. + config.summarize_arrays = ml_collections.ConfigDict({ + 'rules': [ + 'opt_state/.*/hyperparams/learning_rate', # Learning rate. + 'params/.*/Moe/Router/scale', # Soft MoE scale. + ], + # Maximum values reported per rule and array. + # If you are reporting individual values for every expert parameter, + # increase this accordingly. + 'max_summary_values': 1, + }) + # Keep checkpoints every 50k steps, useful to do intermediate cooldowns. + config.save_checkpoint.keep_last = 2 + config.save_checkpoint.keep_steps_multiple_of = 50_000 + return config + + +def get_hyper(hyper, model='soft-s16'): + del model + return hyper.product([]) diff --git a/vmoe/projects/soft_moe/main_contrastive.py b/vmoe/projects/soft_moe/main_contrastive.py new file mode 100644 index 0000000..beffc62 --- /dev/null +++ b/vmoe/projects/soft_moe/main_contrastive.py @@ -0,0 +1,20 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Main training script for Soft MoE experiments.""" +from vmoe import app +from vmoe.projects.contrastive import trainer + +if __name__ == '__main__': + app.run(trainer.train_and_evaluate)
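
For reference, the SigLIP-style objective implemented by sigmoid_loss in vmoe/projects/contrastive/trainer.py above can be sanity-checked in isolation. A minimal standalone sketch (not part of the patch): with well-separated logits, matched image-text pairs on the diagonal drive the per-example loss towards zero.

import jax
import jax.numpy as jnp


def sigmoid_loss(logits):
  # Positives (diagonal) get sign +1, negatives (off-diagonal) get sign -1.
  z = 2. * jnp.eye(logits.shape[-1], dtype=logits.dtype) - 1.
  log_lkh = jax.nn.log_sigmoid(jnp.einsum('...mn,mn->...mn', logits, z))
  return -jnp.sum(log_lkh, axis=-1)


logits = jnp.array([[10., -10.], [-10., 10.]])  # Matched pairs score high.
print(sigmoid_loss(logits))  # ~[9.1e-05, 9.1e-05], i.e. near-zero loss.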