6 changes: 6 additions & 0 deletions experiments/defaults.py
@@ -128,6 +128,7 @@ def default_tokenize(
*,
sample_count: int | VersionedValue[int] | None = None,
is_validation: bool = False,
window_size_bytes: int = 10_000_000_000,
) -> ExecutorStep:
"""
Tokenizes a dataset using the specified tokenizer and Levanter's tokenization infrastructure.
@@ -146,6 +147,8 @@ def default_tokenize(
for more details.
sample_count: Optional limit on the number of samples to tokenize per shard. If ``None``, tokenize everything.
is_validation: Whether the dataset is a validation set. Doesn't do anything for HF datasets.
window_size_bytes: Maximum size in bytes for bundling input files into processing groups. Smaller values
increase parallelism (more worker groups); larger values reduce per-group overhead. Defaults to 10 GB.
Returns:
An ExecutorStep that represents the tokenized dataset.
"""
@@ -159,6 +162,7 @@
tokenizer=ensure_versioned(tokenizer),
format=format,
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
window_size_bytes=window_size_bytes,
)
elif isinstance(dataset, str) and dataset.count("/") == 1 and not fsspec_utils.exists(dataset):
config = HfTokenizeConfig(
@@ -167,6 +171,7 @@
tokenizer=ensure_versioned(tokenizer),
format=format,
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
window_size_bytes=window_size_bytes,
)
else:
config = TokenizeConfig(
@@ -176,6 +181,7 @@
tokenizer=ensure_versioned(tokenizer),
format=format,
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
window_size_bytes=window_size_bytes,
)

return ExecutorStep(
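For intuition, here is a minimal sketch of the behaviour the new window_size_bytes knob is documented to control, assuming a simple greedy bundling strategy; the helper bundle_files and its (path, size) inputs are hypothetical and are not the actual Levanter/Marin tokenization code.

# Illustrative sketch only, not the real implementation: pack files into groups of at most
# window_size_bytes; each group is one unit of parallel work.
def bundle_files(files: list[tuple[str, int]], window_size_bytes: int) -> list[list[str]]:
    """Pack (path, size_in_bytes) entries into groups no larger than window_size_bytes."""
    groups: list[list[str]] = []
    current: list[str] = []
    current_bytes = 0
    for path, size in files:
        # Close the current group when this file would overflow it; a single file larger
        # than the window still gets a group of its own.
        if current and current_bytes + size > window_size_bytes:
            groups.append(current)
            current, current_bytes = [], 0
        current.append(path)
        current_bytes += size
    if current:
        groups.append(current)
    return groups

# With the 10 GB default, twenty 500 MB shards collapse into a single group (one worker);
# with window_size_bytes=50_000_000, each 500 MB shard lands in its own group, so the number
# of groups, and hence the usable parallelism, approaches the shard count.

In the experiments below, the dataset is a Hugging Face repo id (a string with exactly one "/" that is not an existing path), so it presumably takes the HfTokenizeConfig branch above, now carrying window_size_bytes=50_000_000.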
111 changes: 111 additions & 0 deletions experiments/dna/repeat_weight_0.01.py
@@ -0,0 +1,111 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
DNA training experiment with strong repeat downweighting (soft_mask_weight=0.01).

Uses DNALmDatasetFormat to apply a loss weight of 0.01 to soft-masked (lowercase) positions.
"""

import dataclasses
import logging
from fray.cluster import ResourceConfig
from levanter.data.text import DNALmDatasetFormat
from experiments.qwen3 import qwen3_0_6b_hd128
from marin.execution.executor import executor_main
from experiments.defaults import default_tokenize, default_train
from experiments.simple_train_config import SimpleTrainConfig

logger = logging.getLogger("ray")

RESOURCES = ResourceConfig.with_tpu("v5p-8")

# -----------------------------------------------------------------------------
# Experiment configuration
# -----------------------------------------------------------------------------
run_number = 1
tokenizer_path = "songlab/tokenizer-dna-clm"
dataset_path = "gonzalobenegas/genomes-v3-genome_set-animals-intervals-v1_512_256"
dataset_seq_len = 512 # constant for all sequences in dataset
learning_rate = 1e-3
train_batch_size = 2048
lr_schedule = "inv"
num_train_steps = 20_000
steps_per_export = 2000
steps_per_cycle = steps_per_export
steps_per_eval = steps_per_export
warmup = 0.5 # fraction of cycle
decay = 0.1

# -----------------------------------------------------------------------------
# Model configuration
# -----------------------------------------------------------------------------
model_config = dataclasses.replace(qwen3_0_6b_hd128, max_seq_len=dataset_seq_len)

# -----------------------------------------------------------------------------
# Dataset configuration
# -----------------------------------------------------------------------------
data_tokenized = default_tokenize(
name="animal-promoters-repeat-weight-0.01",
dataset=dataset_path,
tokenizer=tokenizer_path,
format=DNALmDatasetFormat(soft_mask_weight=0.01),
# NOTE (unverified): max parallelism is the number of shards in the HF dataset;
# window_size_bytes is set smaller than the shard size to reach that maximum.
window_size_bytes=50_000_000,
)

# -----------------------------------------------------------------------------
# Training configuration
# -----------------------------------------------------------------------------
train_config = SimpleTrainConfig(
resources=RESOURCES,
train_batch_size=train_batch_size,
learning_rate=learning_rate,
lr_schedule=lr_schedule,
warmup=warmup,
decay=decay,
cycle_length=steps_per_cycle,
steps_per_eval=steps_per_eval,
num_train_steps=num_train_steps,
steps_per_export=steps_per_export,
data_seed=42,
)

training_step = default_train(
name=f"animal-promoters-repeat-weight-0.01-r{run_number:02d}",
tokenized=data_tokenized,
model_config=model_config,
train_config=train_config,
tags=["dna", "animal-promoters"],
eval_harness_tasks=[],
use_default_validation=False,
)

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
logger.info("🧬 DNA Training Experiment")
logger.info("=" * 64)
logger.info(f"Model: {model_config}")
logger.info(f"Learning rate: {learning_rate}")
logger.info(f"Global batch size: {train_batch_size}")
logger.info(f"Training steps: {num_train_steps:,}")
logger.info(f"Steps per export: {steps_per_export:,}")
logger.info(f"Steps per eval: {steps_per_eval:,}")
logger.info("=" * 64)

executor_main(steps=[training_step])
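For intuition about what soft_mask_weight=0.01 means here, the sketch below derives per-position loss weights from soft-masking, where lowercase bases (the common convention for repeat-masked genomic sequence) receive the reduced weight; the function soft_mask_loss_weights is hypothetical and only mirrors the idea behind DNALmDatasetFormat, not its actual implementation.

# Hypothetical sketch of soft-mask loss weighting; not the DNALmDatasetFormat implementation.
def soft_mask_loss_weights(seq: str, soft_mask_weight: float) -> list[float]:
    """One loss weight per base: lowercase (soft-masked, i.e. repeat) positions get
    soft_mask_weight, uppercase positions keep the full weight of 1.0."""
    return [soft_mask_weight if base.islower() else 1.0 for base in seq]

print(soft_mask_loss_weights("ACGTacgtACGT", 0.01))
# [1.0, 1.0, 1.0, 1.0, 0.01, 0.01, 0.01, 0.01, 1.0, 1.0, 1.0, 1.0]

With soft_mask_weight=0.01 the repeat positions contribute 1% of the usual loss, while the soft_mask_weight=1.0 run in the next file keeps uniform weights as the control.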
111 changes: 111 additions & 0 deletions experiments/dna/repeat_weight_1.0.py
@@ -0,0 +1,111 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
DNA training experiment with no repeat downweighting (soft_mask_weight=1.0).

Uses DNALmDatasetFormat but with uniform loss weights (control experiment).
"""

import dataclasses
import logging
from fray.cluster import ResourceConfig
from levanter.data.text import DNALmDatasetFormat
from experiments.qwen3 import qwen3_0_6b_hd128
from marin.execution.executor import executor_main
from experiments.defaults import default_tokenize, default_train
from experiments.simple_train_config import SimpleTrainConfig

logger = logging.getLogger("ray")

RESOURCES = ResourceConfig.with_tpu("v5p-8")

# -----------------------------------------------------------------------------
# Experiment configuration
# -----------------------------------------------------------------------------
run_number = 1
tokenizer_path = "songlab/tokenizer-dna-clm"
dataset_path = "gonzalobenegas/genomes-v3-genome_set-animals-intervals-v1_512_256"
dataset_seq_len = 512 # constant for all sequences in dataset
learning_rate = 1e-3
train_batch_size = 2048
lr_schedule = "inv"
num_train_steps = 20_000
steps_per_export = 2000
steps_per_cycle = steps_per_export
steps_per_eval = steps_per_export
warmup = 0.5 # fraction of cycle
decay = 0.1

# -----------------------------------------------------------------------------
# Model configuration
# -----------------------------------------------------------------------------
model_config = dataclasses.replace(qwen3_0_6b_hd128, max_seq_len=dataset_seq_len)

# -----------------------------------------------------------------------------
# Dataset configuration
# -----------------------------------------------------------------------------
data_tokenized = default_tokenize(
name="animal-promoters-repeat-weight-1.0",
dataset=dataset_path,
tokenizer=tokenizer_path,
format=DNALmDatasetFormat(soft_mask_weight=1.0),
# NOTE (unverified): max parallelism is the number of shards in the HF dataset;
# window_size_bytes is set smaller than the shard size to reach that maximum.
window_size_bytes=50_000_000,
)

# -----------------------------------------------------------------------------
# Training configuration
# -----------------------------------------------------------------------------
train_config = SimpleTrainConfig(
resources=RESOURCES,
train_batch_size=train_batch_size,
learning_rate=learning_rate,
lr_schedule=lr_schedule,
warmup=warmup,
decay=decay,
cycle_length=steps_per_cycle,
steps_per_eval=steps_per_eval,
num_train_steps=num_train_steps,
steps_per_export=steps_per_export,
data_seed=42,
)

training_step = default_train(
name=f"animal-promoters-repeat-weight-1.0-r{run_number:02d}",
tokenized=data_tokenized,
model_config=model_config,
train_config=train_config,
tags=["dna", "animal-promoters"],
eval_harness_tasks=[],
use_default_validation=False,
)

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
logger.info("🧬 DNA Training Experiment")
logger.info("=" * 64)
logger.info(f"Model: {model_config}")
logger.info(f"Learning rate: {learning_rate}")
logger.info(f"Global batch size: {train_batch_size}")
logger.info(f"Training steps: {num_train_steps:,}")
logger.info(f"Steps per export: {steps_per_export:,}")
logger.info(f"Steps per eval: {steps_per_eval:,}")
logger.info("=" * 64)

executor_main(steps=[training_step])
111 changes: 111 additions & 0 deletions experiments/dna/standard.py
@@ -0,0 +1,111 @@
# Copyright 2025 The Marin Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Standard DNA training experiment without repeat downweighting.

Uses TextLmDatasetFormat with uniform loss weights across all positions.
"""

import dataclasses
import logging
from fray.cluster import ResourceConfig
from levanter.data.text import TextLmDatasetFormat
from experiments.qwen3 import qwen3_0_6b_hd128
from marin.execution.executor import executor_main
from experiments.defaults import default_tokenize, default_train
from experiments.simple_train_config import SimpleTrainConfig

logger = logging.getLogger("ray")

RESOURCES = ResourceConfig.with_tpu("v5p-8")

# -----------------------------------------------------------------------------
# Experiment configuration
# -----------------------------------------------------------------------------
run_number = 8
tokenizer_path = "songlab/tokenizer-dna-clm"
dataset_path = "gonzalobenegas/genomes-v3-genome_set-animals-intervals-v1_512_256"
dataset_seq_len = 512 # constant for all sequences in dataset
learning_rate = 1e-3
train_batch_size = 2048
lr_schedule = "inv"
num_train_steps = 20_000
steps_per_export = 2000
steps_per_cycle = steps_per_export
steps_per_eval = steps_per_export
warmup = 0.5 # fraction of cycle
decay = 0.1

# -----------------------------------------------------------------------------
# Model configuration
# -----------------------------------------------------------------------------
model_config = dataclasses.replace(qwen3_0_6b_hd128, max_seq_len=dataset_seq_len)

# -----------------------------------------------------------------------------
# Dataset configuration
# -----------------------------------------------------------------------------
data_tokenized = default_tokenize(
name="animal-promoters-standard",
dataset=dataset_path,
tokenizer=tokenizer_path,
format=TextLmDatasetFormat(text_key="seq"),
# NOTE (unverified): max parallelism is the number of shards in the HF dataset;
# window_size_bytes is set smaller than the shard size to reach that maximum.
window_size_bytes=50_000_000,
)

# -----------------------------------------------------------------------------
# Training configuration
# -----------------------------------------------------------------------------
train_config = SimpleTrainConfig(
resources=RESOURCES,
train_batch_size=train_batch_size,
learning_rate=learning_rate,
lr_schedule=lr_schedule,
warmup=warmup,
decay=decay,
cycle_length=steps_per_cycle,
steps_per_eval=steps_per_eval,
num_train_steps=num_train_steps,
steps_per_export=steps_per_export,
data_seed=42,
)

training_step = default_train(
name=f"animal-promoters-standard-r{run_number:02d}",
tokenized=data_tokenized,
model_config=model_config,
train_config=train_config,
tags=["dna", "animal-promoters"],
eval_harness_tasks=[],
use_default_validation=False,
)

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
logger.info("🧬 DNA Training Experiment")
logger.info("=" * 64)
logger.info(f"Model: {model_config}")
logger.info(f"Learning rate: {learning_rate}")
logger.info(f"Global batch size: {train_batch_size}")
logger.info(f"Training steps: {num_train_steps:,}")
logger.info(f"Steps per export: {steps_per_export:,}")
logger.info(f"Steps per eval: {steps_per_eval:,}")
logger.info("=" * 64)

executor_main(steps=[training_step])