6 changes: 6 additions & 0 deletions experiments/defaults.py
@@ -128,6 +128,7 @@ def default_tokenize(
*,
sample_count: int | VersionedValue[int] | None = None,
is_validation: bool = False,
window_size_bytes: int = 10_000_000_000,
) -> ExecutorStep:
"""
Tokenizes a dataset using the specified tokenizer and Levanter's tokenization infrastructure.
@@ -146,6 +147,8 @@ def default_tokenize(
for more details.
sample_count: Optional limit on the number of samples to tokenize per shard. If ``None``, tokenize everything.
is_validation: Whether the dataset is a validation set. Doesn't do anything for HF datasets.
window_size_bytes: Maximum size in bytes for bundling input files into processing groups. Smaller values
increase parallelism (more worker groups); larger values reduce per-group overhead. Defaults to 10 GB.
Returns:
An ExecutorStep that represents the tokenized dataset.
"""
@@ -159,6 +162,7 @@
tokenizer=ensure_versioned(tokenizer),
format=format,
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
window_size_bytes=window_size_bytes,
)
elif isinstance(dataset, str) and dataset.count("/") == 1 and not fsspec_utils.exists(dataset):
config = HfTokenizeConfig(
@@ -167,6 +171,7 @@
tokenizer=ensure_versioned(tokenizer),
format=format,
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
window_size_bytes=window_size_bytes,
)
else:
config = TokenizeConfig(
@@ -176,6 +181,7 @@
tokenizer=ensure_versioned(tokenizer),
format=format,
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
window_size_bytes=window_size_bytes,
)

return ExecutorStep(
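For intuition, here is a minimal sketch of the behaviour the new window_size_bytes knob is documented to control, assuming a simple greedy bundling strategy; the helper bundle_files and its (path, size) inputs are hypothetical and are not the actual Levanter/Marin tokenization code.

# Illustrative sketch only, not the real implementation: pack files into groups of at most
# window_size_bytes; each group is one unit of parallel work.
def bundle_files(files: list[tuple[str, int]], window_size_bytes: int) -> list[list[str]]:
    """Pack (path, size_in_bytes) entries into groups no larger than window_size_bytes."""
    groups: list[list[str]] = []
    current: list[str] = []
    current_bytes = 0
    for path, size in files:
        # Close the current group when this file would overflow it; a single file larger
        # than the window still gets a group of its own.
        if current and current_bytes + size > window_size_bytes:
            groups.append(current)
            current, current_bytes = [], 0
        current.append(path)
        current_bytes += size
    if current:
        groups.append(current)
    return groups

# With the 10 GB default, twenty 500 MB shards collapse into a single group (one worker);
# with window_size_bytes=50_000_000, each 500 MB shard lands in its own group, so the number
# of groups, and hence the usable parallelism, approaches the shard count.

In the experiments below, the dataset is a Hugging Face repo id (a string with exactly one "/" that is not an existing path), so it presumably takes the HfTokenizeConfig branch above, now carrying window_size_bytes=50_000_000.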
111 changes: 111 additions & 0 deletions experiments/dna/repeat_weight_0.01.py
@@ -0,0 +1,111 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
DNA training experiment with strong repeat downweighting (soft_mask_weight=0.01).

Uses DNALmDatasetFormat to apply a loss weight of 0.01 to soft-masked (lowercase) positions.
"""

import dataclasses
import logging
from fray.cluster import ResourceConfig
from levanter.data.text import DNALmDatasetFormat
from experiments.qwen3 import qwen3_0_6b_hd128
from marin.execution.executor import executor_main
from experiments.defaults import default_tokenize, default_train
from experiments.simple_train_config import SimpleTrainConfig

logger = logging.getLogger("ray")

RESOURCES = ResourceConfig.with_tpu("v5p-8")

# -----------------------------------------------------------------------------
# Experiment configuration
# -----------------------------------------------------------------------------
run_number = 1
tokenizer_path = "songlab/tokenizer-dna-clm"
dataset_path = "gonzalobenegas/genomes-v3-genome_set-animals-intervals-v1_512_256"
dataset_seq_len = 512 # constant for all sequences in dataset
learning_rate = 1e-3
train_batch_size = 2048
lr_schedule = "inv"
num_train_steps = 20_000
steps_per_export = 2000
steps_per_cycle = steps_per_export
steps_per_eval = steps_per_export
warmup = 0.5 # fraction of cycle
decay = 0.1

# -----------------------------------------------------------------------------
# Model configuration
# -----------------------------------------------------------------------------
model_config = dataclasses.replace(qwen3_0_6b_hd128, max_seq_len=dataset_seq_len)

# -----------------------------------------------------------------------------
# Dataset configuration
# -----------------------------------------------------------------------------
data_tokenized = default_tokenize(
name="animal-promoters-repeat-weight-0.01",
dataset=dataset_path,
tokenizer=tokenizer_path,
format=DNALmDatasetFormat(soft_mask_weight=0.01),
# NOTE (unverified): max parallelism is the number of shards in the HF dataset;
# window_size_bytes is set smaller than the shard size to reach that maximum.
window_size_bytes=50_000_000,
)

# -----------------------------------------------------------------------------
# Training configuration
# -----------------------------------------------------------------------------
train_config = SimpleTrainConfig(
resources=RESOURCES,
train_batch_size=train_batch_size,
learning_rate=learning_rate,
lr_schedule=lr_schedule,
warmup=warmup,
decay=decay,
cycle_length=steps_per_cycle,
steps_per_eval=steps_per_eval,
num_train_steps=num_train_steps,
steps_per_export=steps_per_export,
data_seed=42,
)

training_step = default_train(
name=f"animal-promoters-repeat-weight-0.01-r{run_number:02d}",
tokenized=data_tokenized,
model_config=model_config,
train_config=train_config,
tags=["dna", "animal-promoters"],
eval_harness_tasks=[],
use_default_validation=False,
)

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
logger.info("🧬 DNA Training Experiment")
logger.info("=" * 64)
logger.info(f"Model: {model_config}")
logger.info(f"Learning rate: {learning_rate}")
logger.info(f"Global batch size: {train_batch_size}")
logger.info(f"Training steps: {num_train_steps:,}")
logger.info(f"Steps per export: {steps_per_export:,}")
logger.info(f"Steps per eval: {steps_per_eval:,}")
logger.info("=" * 64)

executor_main(steps=[training_step])
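For intuition about what soft_mask_weight=0.01 means here, the sketch below derives per-position loss weights from soft-masking, where lowercase bases (the common convention for repeat-masked genomic sequence) receive the reduced weight; the function soft_mask_loss_weights is hypothetical and only mirrors the idea behind DNALmDatasetFormat, not its actual implementation.

# Hypothetical sketch of soft-mask loss weighting; not the DNALmDatasetFormat implementation.
def soft_mask_loss_weights(seq: str, soft_mask_weight: float) -> list[float]:
    """One loss weight per base: lowercase (soft-masked, i.e. repeat) positions get
    soft_mask_weight, uppercase positions keep the full weight of 1.0."""
    return [soft_mask_weight if base.islower() else 1.0 for base in seq]

print(soft_mask_loss_weights("ACGTacgtACGT", 0.01))
# [1.0, 1.0, 1.0, 1.0, 0.01, 0.01, 0.01, 0.01, 1.0, 1.0, 1.0, 1.0]

With soft_mask_weight=0.01 the repeat positions contribute 1% of the usual loss, while the soft_mask_weight=1.0 run in the next file keeps uniform weights as the control.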
111 changes: 111 additions & 0 deletions experiments/dna/repeat_weight_1.0.py
@@ -0,0 +1,111 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
DNA training experiment with no repeat downweighting (soft_mask_weight=1.0).

Uses DNALmDatasetFormat but with uniform loss weights (control experiment).
"""

import dataclasses
import logging
from fray.cluster import ResourceConfig
from levanter.data.text import DNALmDatasetFormat
from experiments.qwen3 import qwen3_0_6b_hd128
from marin.execution.executor import executor_main
from experiments.defaults import default_tokenize, default_train
from experiments.simple_train_config import SimpleTrainConfig

logger = logging.getLogger("ray")

RESOURCES = ResourceConfig.with_tpu("v5p-8")

# -----------------------------------------------------------------------------
# Experiment configuration
# -----------------------------------------------------------------------------
run_number = 1
tokenizer_path = "songlab/tokenizer-dna-clm"
dataset_path = "gonzalobenegas/genomes-v3-genome_set-animals-intervals-v1_512_256"
dataset_seq_len = 512 # constant for all sequences in dataset
learning_rate = 1e-3
train_batch_size = 2048
lr_schedule = "inv"
num_train_steps = 20_000
steps_per_export = 2000
steps_per_cycle = steps_per_export
steps_per_eval = steps_per_export
warmup = 0.5 # fraction of cycle
decay = 0.1

# -----------------------------------------------------------------------------
# Model configuration
# -----------------------------------------------------------------------------
model_config = dataclasses.replace(qwen3_0_6b_hd128, max_seq_len=dataset_seq_len)

# -----------------------------------------------------------------------------
# Dataset configuration
# -----------------------------------------------------------------------------
data_tokenized = default_tokenize(
name="animal-promoters-repeat-weight-1.0",
dataset=dataset_path,
tokenizer=tokenizer_path,
format=DNALmDatasetFormat(soft_mask_weight=1.0),
# NOTE (unverified): max parallelism is the number of shards in the HF dataset;
# window_size_bytes is set smaller than the shard size to reach that maximum.
window_size_bytes=50_000_000,
)

# -----------------------------------------------------------------------------
# Training configuration
# -----------------------------------------------------------------------------
train_config = SimpleTrainConfig(
resources=RESOURCES,
train_batch_size=train_batch_size,
learning_rate=learning_rate,
lr_schedule=lr_schedule,
warmup=warmup,
decay=decay,
cycle_length=steps_per_cycle,
steps_per_eval=steps_per_eval,
num_train_steps=num_train_steps,
steps_per_export=steps_per_export,
data_seed=42,
)

training_step = default_train(
name=f"animal-promoters-repeat-weight-1.0-r{run_number:02d}",
tokenized=data_tokenized,
model_config=model_config,
train_config=train_config,
tags=["dna", "animal-promoters"],
eval_harness_tasks=[],
use_default_validation=False,
)

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
logger.info("🧬 DNA Training Experiment")
logger.info("=" * 64)
logger.info(f"Model: {model_config}")
logger.info(f"Learning rate: {learning_rate}")
logger.info(f"Global batch size: {train_batch_size}")
logger.info(f"Training steps: {num_train_steps:,}")
logger.info(f"Steps per export: {steps_per_export:,}")
logger.info(f"Steps per eval: {steps_per_eval:,}")
logger.info("=" * 64)

executor_main(steps=[training_step])
111 changes: 111 additions & 0 deletions experiments/dna/standard.py
@@ -0,0 +1,111 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Standard DNA training experiment without repeat downweighting.

Uses TextLmDatasetFormat with uniform loss weights across all positions.
"""

import dataclasses
import logging
from fray.cluster import ResourceConfig
from levanter.data.text import TextLmDatasetFormat
from experiments.qwen3 import qwen3_0_6b_hd128
from marin.execution.executor import executor_main
from experiments.defaults import default_tokenize, default_train
from experiments.simple_train_config import SimpleTrainConfig

logger = logging.getLogger("ray")

RESOURCES = ResourceConfig.with_tpu("v5p-8")

# -----------------------------------------------------------------------------
# Experiment configuration
# -----------------------------------------------------------------------------
run_number = 8
tokenizer_path = "songlab/tokenizer-dna-clm"
dataset_path = "gonzalobenegas/genomes-v3-genome_set-animals-intervals-v1_512_256"
dataset_seq_len = 512 # constant for all sequences in dataset
learning_rate = 1e-3
train_batch_size = 2048
lr_schedule = "inv"
num_train_steps = 20_000
steps_per_export = 2000
steps_per_cycle = steps_per_export
steps_per_eval = steps_per_export
warmup = 0.5 # fraction of cycle
decay = 0.1

# -----------------------------------------------------------------------------
# Model configuration
# -----------------------------------------------------------------------------
model_config = dataclasses.replace(qwen3_0_6b_hd128, max_seq_len=dataset_seq_len)

# -----------------------------------------------------------------------------
# Dataset configuration
# -----------------------------------------------------------------------------
data_tokenized = default_tokenize(
name="animal-promoters-standard",
dataset=dataset_path,
tokenizer=tokenizer_path,
format=TextLmDatasetFormat(text_key="seq"),
# NOTE (unverified): max parallelism is the number of shards in the HF dataset;
# window_size_bytes is set smaller than the shard size to reach that maximum.
window_size_bytes=50_000_000,
)

# -----------------------------------------------------------------------------
# Training configuration
# -----------------------------------------------------------------------------
train_config = SimpleTrainConfig(
resources=RESOURCES,
train_batch_size=train_batch_size,
learning_rate=learning_rate,
lr_schedule=lr_schedule,
warmup=warmup,
decay=decay,
cycle_length=steps_per_cycle,
steps_per_eval=steps_per_eval,
num_train_steps=num_train_steps,
steps_per_export=steps_per_export,
data_seed=42,
)

training_step = default_train(
name=f"animal-promoters-standard-r{run_number:02d}",
tokenized=data_tokenized,
model_config=model_config,
train_config=train_config,
tags=["dna", "animal-promoters"],
eval_harness_tasks=[],
use_default_validation=False,
)

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
logger.info("🧬 DNA Training Experiment")
logger.info("=" * 64)
logger.info(f"Model: {model_config}")
logger.info(f"Learning rate: {learning_rate}")
logger.info(f"Global batch size: {train_batch_size}")
logger.info(f"Training steps: {num_train_steps:,}")
logger.info(f"Steps per export: {steps_per_export:,}")
logger.info(f"Steps per eval: {steps_per_eval:,}")
logger.info("=" * 64)

executor_main(steps=[training_step])