diff --git a/.gitignore b/.gitignore index 2bd422eca7..4f351e98f7 100644 --- a/.gitignore +++ b/.gitignore @@ -223,3 +223,4 @@ gha-creds-*.json *.jsonl **/*.jsonl scr/* +output \ No newline at end of file diff --git a/lib/levanter/scripts/launch_vlm_training.py b/lib/levanter/scripts/launch_vlm_training.py new file mode 100644 index 0000000000..ddf7ac3159 --- /dev/null +++ b/lib/levanter/scripts/launch_vlm_training.py @@ -0,0 +1,576 @@ +#!/usr/bin/env python3 +# Copyright 2025 The Levanter Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Launch script for VLM (Vision-Language Model) training with LLaVA OneVision. + +This script provides a complete training pipeline for LLaVA OneVision models +using real parquet data, with performance optimizations for TPU/GPU training. + +Usage: + # Train from scratch with small model config + python launch_vlm_training.py + + # Train with HuggingFace pretrained weights + python launch_vlm_training.py --initialize_from_hf + + # Train with a single parquet file + python launch_vlm_training.py --train_data /path/to/train.parquet --val_data /path/to/val.parquet + + # Train with a folder containing multiple parquet files + python launch_vlm_training.py --train_data /path/to/train_folder/ --val_data /path/to/val_folder/ + + # Train with glob pattern + python launch_vlm_training.py --train_data "/path/to/data/*.parquet" + + # Full training run + python launch_vlm_training.py --initialize_from_hf --num_train_steps 10000 --train_batch_size 32 + + # High-performance training with all speed optimizations enabled + python launch_vlm_training.py --initialize_from_hf --mp bfloat16 \\ + --freeze_vision_encoder --per_device_parallelism 8 + +Performance Optimization Flags: + --freeze_vision_encoder : Freeze vision encoder (only train projector + LLM) + --per_device_parallelism: Number of examples per device (for gradient accumulation) + --fsdp_axis : FSDP sharding axis (default: embed) +""" + +import argparse +import asyncio +import dataclasses +import logging + +import jmp # For mixed precision policy + +import levanter.main.train_vlm as train_vlm +from levanter.data.image import ConversationDatasetSourceConfig, ImageMixtureDatasetConfig +from levanter.distributed import DistributedConfig, RayConfig +from levanter.models.llava_onevision import LlavaOnevisionConfig +from levanter.models.siglip import SiglipVisionConfig +from levanter.models.qwen import Qwen3Config, QwenConfig +from levanter.models.rotary import DefaultRotaryEmbeddingsConfig +from levanter.layers.attention import AttentionBackend +from levanter.optim import AdamConfig +from levanter.tracker import NoopConfig +from levanter.tracker.wandb import WandbConfig +from levanter.checkpoint import CheckpointerConfig + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Launch VLM training with LLaVA OneVision") + + # Data arguments + parser.add_argument( + "--train_data", + type=str, + default="./output", + help="Path to training data. Can be: a single parquet file, a directory containing parquet files, " + "or a glob pattern (e.g., '/path/to/*.parquet')", + ) + parser.add_argument( + "--val_data", + type=str, + default=None, + help="Path to validation data. 
Same format as --train_data (defaults to train_data)", + ) + parser.add_argument( + "--cache_dir", + type=str, + default="/tmp/vlm_cache", + help="Directory for data caching", + ) + parser.add_argument( + "--no_cache", + action="store_true", + help="Disable caching and use streaming mode (processes images on-the-fly, saves disk space)", + ) + parser.add_argument( + "--max_length", + type=int, + default=8192, + help="Maximum sequence length", + ) + + # Model arguments + parser.add_argument( + "--model_name", + type=str, + default="llava-hf/llava-onevision-qwen2-7b-ov-hf", + help="HuggingFace model name for processor and optional weight initialization", + ) + parser.add_argument( + "--initialize_from_hf", + action="store_true", # Default is False; we use custom weight loading for SigLIP + Qwen3 + help="Initialize model weights from HuggingFace checkpoint (for unified llava-onevision models)", + ) + parser.add_argument( + "--use_hf_model_config", + action="store_true", # Default is False; use custom SigLIP + Qwen3 config + help="Use model config from HuggingFace checkpoint (set to True to load full llava-onevision model)", + ) + parser.add_argument( + "--use_small_model", + action="store_true", + help="Use small model config for testing (overrides --use_hf_model_config)", + ) + + # Training arguments + parser.add_argument( + "--num_train_steps", + type=int, + default=20000, + help="Number of training steps", + ) + parser.add_argument( + "--epoch", + type=int, + default=1, + help="Number of epochs to train (default: 1). If 0, train indefinitely until num_train_steps is reached. " + "If > 0, dataset will cycle through the data for the specified number of epochs.", + ) + parser.add_argument( + "--train_batch_size", + type=int, + default=8, + help="Training batch size", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-5, + help="Learning rate", + ) + parser.add_argument( + "--weight_decay", + type=float, + default=0.0, + help="Weight decay", + ) + parser.add_argument( + "--warmup_ratio", + type=float, + default=0.03, + help="Warmup ratio", + ) + + # === Performance Optimization Arguments === + parser.add_argument( + "--mp", + type=str, + default="bfloat16", + choices=["bfloat16", "float16", "float32", None], + help="Mixed precision mode: bfloat16 (recommended for TPU), float16 (GPU), or float32 (full precision)", + ) + parser.add_argument( + "--no_flash_attention", + action="store_true", + help="Disable flash attention (enabled by default for memory-efficient attention computation)", + ) + parser.add_argument( + "--flash_attention_block_size", + type=int, + default=1024, + help="Block size for flash attention (default: 512, use smaller values if OOM)", + ) + parser.add_argument( + "--per_device_parallelism", + type=int, + default=-1, + help="Number of examples to process per device. -1 means train_batch_size/num_devices. " + "Set lower for gradient accumulation to save memory.", + ) + parser.add_argument( + "--freeze_vision_encoder", + action="store_true", + help="Freeze vision encoder weights (only train projector and LLM). " + "Reduces compute by ~30% and often improves fine-tuning results.", + ) + parser.add_argument( + "--freeze_llm", + action="store_true", + help="Freeze LLM weights (only train projector and vision encoder). " + "Useful for vision encoder fine-tuning or projector-only training.", + ) + parser.add_argument( + "--fsdp_axis", + type=str, + default="embed", + help="Axis to use for FSDP sharding. 
Options: embed, mlp, or comma-separated list", + ) + parser.add_argument( + "--no_gradient_checkpointing", + action="store_true", + help="Disable gradient checkpointing (enabled by default to reduce memory usage)", + ) + + # Checkpoint arguments + parser.add_argument( + "--output_dir", + type=str, + default="/tmp/vlm_output", + help="Directory for saving checkpoints", + ) + parser.add_argument( + "--hf_save_path", + type=str, + default=None, + help="Path to save HuggingFace format checkpoints", + ) + parser.add_argument( + "--hf_save_steps", + type=int, + default=1000, + help="Save HF checkpoint every N steps", + ) + parser.add_argument( + "--checkpointer_path", + type=str, + default=None, + help="Path for Levanter checkpoints (defaults to output_dir/checkpoints)", + ) + + # Logging arguments + parser.add_argument( + "--wandb_project", + type=str, + default="marin-vlm", + help="Weights & Biases project name (None to disable)", + ) + parser.add_argument( + "--wandb_run_name", + type=str, + default=None, + help="Weights & Biases run name", + ) + + # Distributed arguments + parser.add_argument( + "--no_distributed", + action="store_true", + help="Disable JAX distributed initialization", + ) + + # Evaluation arguments + parser.add_argument( + "--max_eval_batches", + type=int, + default=10, + help="Maximum number of evaluation batches", + ) + parser.add_argument( + "--steps_per_eval", + type=int, + default=500, # Default to less frequent eval to reduce memory pressure from dual JIT + help="How often to run evaluation (in steps). Higher values reduce JIT compilation memory overhead.", + ) + parser.add_argument( + "--per_device_eval_parallelism", + type=int, + default=-1, # Same as training to potentially reuse XLA compilation cache + help="Number of examples to process per device during evaluation. 
" + "Default: -1 (same as training batch size).", + ) + parser.add_argument( + "--no_eval", + action="store_true", + help="Disable evaluation completely to save memory", + ) + + return parser.parse_args() + + +def get_model_config(args) -> LlavaOnevisionConfig: + """Get model configuration based on arguments with performance optimizations.""" + + # Determine gradient checkpointing setting + use_gradient_checkpointing = not args.no_gradient_checkpointing + + # Determine attention backend (flash attention enabled by default) + use_flash = not args.no_flash_attention + if use_flash: + attn_backend = AttentionBackend.DEFAULT + flash_block_size = args.flash_attention_block_size + else: + attn_backend = AttentionBackend.VANILLA + flash_block_size = None + + if args.use_small_model: + # Small model config for testing + logger.info("Using small model config for testing") + vision_config = SiglipVisionConfig( + hidden_size=64, + intermediate_size=256, + num_hidden_layers=2, + num_attention_heads=4, + image_size=384, + gradient_checkpointing=use_gradient_checkpointing, + use_flash_attention=use_flash, + attn_backend=attn_backend, + flash_attention_block_size=flash_block_size, + ) + text_config = QwenConfig( + hidden_dim=128, + intermediate_dim=512, + num_layers=2, + num_heads=4, + num_kv_heads=2, + gradient_checkpointing=use_gradient_checkpointing, + attn_backend=attn_backend, + flash_attention_block_size=flash_block_size, + ) + else: + # Custom config: SigLIP2 (from google/siglip2-so400m-patch16-384) + Qwen3-1.7B + # Vision: SigLIP2 so400m-patch16-384 config (using SigLIP architecture) + # LLM: Qwen3-1.7B config (not Qwen2) + logger.info("Using custom config: SigLIP2-so400m-patch16 + Qwen3-1.7B") + + # SigLIP2 so400m-patch16-384 config (from HuggingFace) + vision_config = SiglipVisionConfig( + hidden_size=1152, + intermediate_size=4304, + num_hidden_layers=27, + num_attention_heads=16, + image_size=384, + patch_size=16, + gradient_checkpointing=use_gradient_checkpointing, + use_flash_attention=use_flash, + attn_backend=attn_backend, + flash_attention_block_size=flash_block_size, + ) + + # Qwen3-1.7B config (from HuggingFace Qwen/Qwen3-1.7B) + text_config = Qwen3Config( + hidden_dim=2048, + intermediate_dim=6144, + num_layers=28, + num_heads=16, + num_kv_heads=8, + max_seq_len=40960, + gradient_checkpointing=use_gradient_checkpointing, + attn_backend=attn_backend, + flash_attention_block_size=flash_block_size, + rope=DefaultRotaryEmbeddingsConfig(theta=1000000.0), + use_bias=False, + tie_word_embeddings=True, + ) + + config = LlavaOnevisionConfig( + vision_config=vision_config, + text_config=text_config, + gradient_checkpointing=use_gradient_checkpointing, + ) + + # Log optimization settings + logger.info(f" Gradient checkpointing: {use_gradient_checkpointing}") + logger.info(f" Flash attention: {use_flash}") + if use_flash: + logger.info(f" Flash attention block size: {flash_block_size}") + + return config + + +def main(): + args = parse_args() + + # Set validation data to train data if not specified + if args.val_data is None: + args.val_data = args.train_data + + logger.info("=" * 60) + logger.info("VLM Training Configuration") + logger.info("=" * 60) + logger.info(f"Training data: {args.train_data}") + logger.info(f"Validation data: {args.val_data}") + logger.info(f"Model: {args.model_name}") + logger.info(f"Initialize from HF: {args.initialize_from_hf}") + logger.info(f"Num train steps: {args.num_train_steps}") + logger.info(f"Batch size: {args.train_batch_size}") + + # Log performance 
optimization settings + logger.info("-" * 60) + logger.info("Performance Optimizations:") + logger.info(f" Mixed precision: {args.mp or 'disabled (float32)'}") + logger.info(f" Flash attention: {not args.no_flash_attention}") + logger.info(f" Freeze vision encoder: {args.freeze_vision_encoder}") + logger.info(f" Per-device parallelism: {args.per_device_parallelism}") + logger.info(f" FSDP axis: {args.fsdp_axis}") + logger.info(f" Gradient checkpointing: {not args.no_gradient_checkpointing}") + logger.info("-" * 60) + + # Create data config + data_config = ImageMixtureDatasetConfig( + cache_dir=args.cache_dir, + configs={ + "train": ConversationDatasetSourceConfig( + train_urls=[f"file://{args.train_data}"], + validation_urls=[f"file://{args.val_data}"], + cache_dir=f"{args.cache_dir}/train", + ), + }, + train_weights={"train": 1.0}, + processor=args.model_name, + max_length=args.max_length, + use_cache=not args.no_cache, # Use streaming mode if --no_cache is set + ) + + if args.no_cache: + logger.info("Using streaming mode (no caching) - images will be processed on-the-fly") + + # Log dataset file count + logger.info("-" * 60) + logger.info("Dataset Files:") + for name, source_config in data_config.configs.items(): + train_urls = source_config.urls_for_split("train") + val_urls = source_config.urls_for_split("validation") + logger.info(f" {name}: {len(train_urls)} train file(s), {len(val_urls)} validation file(s)") + logger.info("-" * 60) + + # Calculate num_train_steps based on epoch if specified + num_train_steps = args.num_train_steps + if args.epoch > 0: + # Build training datasets to get the actual dataset size + logger.info("Building training datasets to calculate epoch-based steps...") + train_datasets = data_config.training_sets() + + # Calculate total dataset size from all training datasets + total_dataset_size = 0 + for name, ds in train_datasets.items(): + try: + ds_len = asyncio.run(ds.async_len()) + total_dataset_size += ds_len + logger.info(f" Dataset '{name}': {ds_len:,} samples") + except Exception as e: + logger.warning(f"Could not get length of dataset '{name}': {e}") + + if total_dataset_size > 0: + # Calculate steps needed for the specified number of epochs + steps_per_epoch = total_dataset_size // args.train_batch_size + epoch_based_steps = steps_per_epoch * args.epoch + num_train_steps = epoch_based_steps + logger.info( + f"Epoch-based training: {args.epoch} epoch(s) = {num_train_steps:,} steps " + f"({total_dataset_size:,} samples / {args.train_batch_size} batch_size * {args.epoch} epochs)" + ) + else: + logger.warning("Could not determine dataset size, using --num_train_steps instead") + + # Create model config with optimizations + model_config = get_model_config(args) + + # Create optimizer config + warmup_steps = int(num_train_steps * args.warmup_ratio) + optimizer_config = AdamConfig( + learning_rate=args.learning_rate, + weight_decay=args.weight_decay, + warmup=warmup_steps, + ) + + # Create tracker config + if args.wandb_project: + tracker_config = WandbConfig( + project=args.wandb_project, + name=args.wandb_run_name, + ) + else: + tracker_config = NoopConfig() + + # Create distributed config + distributed_config = DistributedConfig(initialize_jax_distributed=not args.no_distributed) + + # Set checkpoint path + checkpointer_path = args.checkpointer_path or f"{args.output_dir}/checkpoints" + checkpointer_config = CheckpointerConfig(base_path=checkpointer_path) + + # Parse FSDP axis (can be comma-separated for multi-axis) + fsdp_axis = args.fsdp_axis + if "," in 
fsdp_axis: + fsdp_axis = [ax.strip() for ax in fsdp_axis.split(",")] + + # Convert mixed precision string to jmp.Policy + # jmp.get_policy accepts strings like "f32", "bf16", "bfloat16", or + # "compute=bfloat16,params=float32,output=float32" + if args.mp: + mp_policy = jmp.get_policy(args.mp) + else: + mp_policy = jmp.get_policy("f32") # Default to full precision + + # Create trainer config with performance optimizations + trainer_config = train_vlm.TrainerConfig( + num_train_steps=num_train_steps, + train_batch_size=args.train_batch_size, + per_device_parallelism=args.per_device_parallelism, + per_device_eval_parallelism=args.per_device_eval_parallelism, # Smaller eval batch to save memory + max_eval_batches=args.max_eval_batches, + steps_per_eval=args.steps_per_eval, + tracker=tracker_config, + checkpointer=checkpointer_config, + distributed=distributed_config, + ray=RayConfig(auto_start_cluster=False), + # # FSDP configuration + # fsdp_axis=fsdp_axis, + # Mixed precision configuration + mp=mp_policy, + ) + + # Create main training config + # Note: When using custom config (SigLIP + Qwen3), we disable use_hf_model_config + # and initialize_from_hf since we'll load weights separately + use_custom_config = not args.use_small_model and not args.use_hf_model_config + config = train_vlm.TrainVLMConfig( + data=data_config, + model=model_config, + trainer=trainer_config, + optimizer=optimizer_config, + # Disable HF loading when using custom config - we'll load weights separately + initialize_from_hf=( + False + if use_custom_config + else ( + args.initialize_from_hf + if args.initialize_from_hf + else args.model_name if args.use_hf_model_config else False + ) + ), + use_hf_model_config=args.use_hf_model_config and not args.use_small_model, + hf_save_path=args.hf_save_path, + hf_save_steps=args.hf_save_steps, + # Custom weight loading paths for hybrid model + # Though it's SigLIP2, the architecture is the same as SigLIP, so we use the siglip config. 
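+        # Roughly: with the default custom config (neither --use_small_model nor --use_hf_model_config),
+        # initialize_from_hf resolves to False and the two checkpoints below are loaded separately:
+        #   vision tower   <- "google/siglip2-so400m-patch16-384"
+        #   language model <- "Qwen/Qwen3-1.7B"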
+ vision_checkpoint="google/siglip2-so400m-patch16-384" if use_custom_config else None, + llm_checkpoint="Qwen/Qwen3-1.7B" if use_custom_config else None, + # Evaluation control + no_eval=args.no_eval, + # Epoch control + epoch=args.epoch, + ) + + # Handle freezing if requested + if args.freeze_vision_encoder: + config = dataclasses.replace(config, freeze_vision_encoder=True) + if args.freeze_llm: + config = dataclasses.replace(config, freeze_llm=True) + + logger.info("=" * 60) + logger.info("Starting VLM training...") + logger.info(f"Checkpoints will be saved to: {checkpointer_path}") + if args.hf_save_path: + logger.info(f"HF checkpoints will be saved to: {args.hf_save_path}") + if args.epoch > 0: + logger.info(f"Training for {args.epoch} epoch(s) ({num_train_steps:,} steps)") + else: + logger.info(f"Training for {num_train_steps:,} steps (no epoch limit)") + + # Run training + train_vlm.main(config) + + logger.info("Training completed!") + + +if __name__ == "__main__": + main() diff --git a/lib/levanter/src/levanter/compat/hf_checkpoints.py b/lib/levanter/src/levanter/compat/hf_checkpoints.py index 53dff0ae82..95fe39ab96 100644 --- a/lib/levanter/src/levanter/compat/hf_checkpoints.py +++ b/lib/levanter/src/levanter/compat/hf_checkpoints.py @@ -41,6 +41,7 @@ from jax import ShapeDtypeStruct from jax._src.mesh import get_concrete_mesh from jax._src.partition_spec import PartitionSpec +from jax.sharding import NamedSharding from jax.random import PRNGKey from jaxtyping import Array, PRNGKeyArray from tqdm_loggable.auto import tqdm @@ -276,7 +277,10 @@ def _to_state_dict_with_dtype( logger.debug(f"Skipping dtype conversion for non-floating point array {k} with dtype {v.dtype}") # deshard. We could be smarter here and use a process mesh or host offloading, but this is simpler for now - state_dict = jax.lax.with_sharding_constraint(state_dict, PartitionSpec()) + mesh = get_concrete_mesh() + if mesh is not None and mesh.shape: + sharding = NamedSharding(mesh, PartitionSpec()) + state_dict = jax.lax.with_sharding_constraint(state_dict, sharding) return state_dict @@ -673,7 +677,13 @@ def load_pretrained( # Vocab: first we have to resize the vocab as loaded from the checkpoint tokenizer_Vocab = self.Vocab - Vocab = tokenizer_Vocab.resize(hf_config.vocab_size) + # For multimodal models like LlavaOnevision, vocab_size is in text_config + hf_vocab_size = getattr(hf_config, "vocab_size", None) + if hf_vocab_size is None and hasattr(hf_config, "text_config"): + hf_vocab_size = hf_config.text_config.vocab_size + if hf_vocab_size is None: + raise ValueError("Could not find vocab_size in hf_config or hf_config.text_config") + Vocab = tokenizer_Vocab.resize(hf_vocab_size) # TODO: in an ideal world, we would only load the part of the array we needed, but # AFAICT neither torch state dicts nor safetensors support this. diff --git a/lib/levanter/src/levanter/data/image.py b/lib/levanter/src/levanter/data/image.py new file mode 100644 index 0000000000..ae66342d1b --- /dev/null +++ b/lib/levanter/src/levanter/data/image.py @@ -0,0 +1,3076 @@ +# Copyright 2025 The Levanter Authors +# SPDX-License-Identifier: Apache-2.0 + +""" +Image data processing module for vision-language models like LLaVA OneVision. 
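+
+A typical wiring of this module from a training script (a minimal sketch mirroring
+scripts/launch_vlm_training.py in this change; the paths and processor name shown here
+are placeholders):
+
+    from levanter.data.image import ConversationDatasetSourceConfig, ImageMixtureDatasetConfig
+
+    data_config = ImageMixtureDatasetConfig(
+        cache_dir="/tmp/vlm_cache",
+        configs={
+            "train": ConversationDatasetSourceConfig(
+                train_urls=["file:///path/to/train.parquet"],
+                validation_urls=["file:///path/to/val.parquet"],
+                cache_dir="/tmp/vlm_cache/train",
+            ),
+        },
+        train_weights={"train": 1.0},
+        processor="llava-hf/llava-onevision-qwen2-7b-ov-hf",
+        max_length=8192,
+    )
+    train_sets = data_config.training_sets()
+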
+ +This module provides utilities for: +- Loading and preprocessing images from various sources (URLs, HuggingFace datasets) +- Processing conversation-format data with interleaved images and text +- Converting images to model-ready tensors with proper axes +- Batching and caching processed image-text pairs + +Conversation Format Example: +{ + "messages": [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is in this image?"} + ] + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "This image shows..."} + ] + } + ], + "images": ["path/to/image.jpg"] # or PIL Images, or URLs +} +""" + +import abc +import asyncio +import dataclasses +import json +import logging +import math +import os +import threading +import weakref +from collections import OrderedDict +from collections.abc import Iterable +from dataclasses import dataclass +from functools import cached_property +from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union, cast + +import braceexpand +import datasets +import equinox as eqx +import fsspec +import haliax as hax +import jax +import numpy +import numpy as np +from draccus import field +from haliax import Axis, NamedArray +from haliax.partitioning import ResourceMapping +from jax.sharding import Mesh, PartitionSpec + +from levanter.data.mixture import MixtureDataset, StopStrategy +from jaxtyping import PRNGKeyArray +from typing_extensions import TypedDict + +from levanter.compat.hf_checkpoints import load_processor +from levanter.data import AsyncDataset +from levanter.data._preprocessor import BatchProcessor +from levanter.data.dataset import EpochDataset, MappedAsyncDataset +from levanter.data.loader import DataLoader, DataLoaderIterator, _Batch +from levanter.data.sharded_datasource import ( + ShardedDataSource, + UrlBackedShardedDataSource, + WrappedHFDataSource, + _sniff_format_for_dataset, +) +from levanter.schedule import IntSchedule +from levanter.shapes import NamedShapeSpec, ShapeSpec +from levanter.store.cache import CacheOptions, TreeCache, build_or_load_cache +from levanter.utils.jax_utils import key_iterator +from levanter.utils.logging import silence_transformer_nag + +silence_transformer_nag() +from transformers import ( # noqa: E402 + BatchFeature, + PreTrainedTokenizerBase, + ProcessorMixin, +) +from transformers.image_processing_utils import select_best_resolution # noqa: E402 +from transformers.image_utils import ImageInput, get_image_size, to_numpy_array # noqa: E402 +from transformers.processing_utils import MultiModalData, ProcessingKwargs, Unpack # noqa: E402 +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput # noqa: E402 +from transformers.utils import logging as transformers_logging # noqa: E402 +from transformers.video_utils import VideoInput # noqa: E402 + +# Image loading dependencies - imported at module level for performance +from io import BytesIO # noqa: E402 + +import requests # noqa: E402 +from PIL import Image # noqa: E402 + +logger = logging.getLogger("levanter.data.image") + + +class ImageTextUrlDataSource(UrlBackedShardedDataSource[dict]): + """ + Dataset for image-text pairs from various file formats (JSON, JSONL, Parquet). 
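+
+    For example (illustrative records, not from a real dataset), a JSONL shard read with the
+    default keys might contain lines like:
+
+        {"image": "https://example.com/images/0001.jpg", "text": "A red bicycle leaning against a wall."}
+        {"image": {"path": "images/0002.png"}, "text": "Two cats sleeping on a sofa."}
+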
+ + This data source reads image-text pairs where: + - image_key: points to the image data (can be path, URL, bytes, or HF dict format) + - text_key: points to the text description/caption + + Supports HuggingFace-style image formats: + - {"bytes": } + - {"path": "path/to/image.jpg"} + - Direct path string or URL + """ + + def __init__(self, urls, image_key="image", text_key="text"): + super().__init__(urls) + self.image_key = image_key + self.text_key = text_key + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: + url = self._shard_name_to_url_mapping[shard_name] + i = 0 + with fsspec.open(url, "r", compression="infer") as f: + format = _sniff_format_for_dataset(url) + match format: + case ".jsonl": + for line in f: + if i >= row: + data = json.loads(line) + yield { + "image": data[self.image_key], + "text": data[self.text_key], + } + i += 1 + case ".json": + data = json.load(f) + for doc in data[row:]: + yield { + "image": doc[self.image_key], + "text": doc[self.text_key], + } + case _: + raise ValueError(f"Unknown format {format}") + + +class ImageConversationUrlDataSource(UrlBackedShardedDataSource[dict]): + """ + Dataset for conversation-format image-text data (VLM training format). + + This data source reads conversation data with interleaved images and text, + used for vision-language model training like LLaVA. + + Expected data format: + { + "messages": [ + {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}, + {"role": "assistant", "content": [{"type": "text", "text": "..."}]} + ], + "images": ["path/to/image.jpg"] # or PIL Images, URLs, or bytes + } + """ + + def __init__(self, urls, messages_key="messages", images_key="images"): + super().__init__(urls) + self.messages_key = messages_key + self.images_key = images_key + + def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: + url = self._shard_name_to_url_mapping[shard_name] + i = 0 + format = _sniff_format_for_dataset(url) + if format == ".parquet": + # Handle parquet files + import pyarrow.parquet as pq + + with fsspec.open(url, "rb") as f: + table = pq.read_table(f) + data = table.to_pydict() + num_rows = table.num_rows + for idx in range(row, num_rows): + yield { + "messages": data[self.messages_key][idx], + "images": data.get(self.images_key, [[]])[idx], + } + else: + with fsspec.open(url, "r", compression="infer") as f: + match format: + case ".jsonl": + for line in f: + if i >= row: + data = json.loads(line) + yield { + "messages": data[self.messages_key], + "images": data.get(self.images_key, []), + } + i += 1 + case ".json": + data = json.load(f) + for doc in data[row:]: + yield { + "messages": doc[self.messages_key], + "images": doc.get(self.images_key, []), + } + case _: + raise ValueError(f"Unknown format {format}") + + +class CustomVLMProcessor(ProcessorMixin): + """ + Custom VLM processor that combines components from different sources. + + This allows using a different tokenizer (e.g., Qwen3-1.7B) while keeping + the image/video processing from the original processor. Instead of mutating + the original processor's tokenizer, this creates a new processor instance + that properly combines the components. 
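+
+    Example (an illustrative sketch: the component checkpoints are placeholders, and the call at
+    the end assumes the usual ProcessorMixin-style __call__ of LLaVA-style processors):
+
+        from transformers import AutoImageProcessor, AutoTokenizer
+
+        image_processor = AutoImageProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
+        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
+        processor = CustomVLMProcessor(image_processor, tokenizer)
+
+        # `pil_image` is a PIL.Image.Image; the image token is interleaved into the prompt text.
+        batch = processor(text="<image>\nWhat is in this picture?", images=[pil_image], return_tensors="np")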
+ """ + + attributes = ["image_processor", "tokenizer", "video_processor"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + video_processor_class = "AutoVideoProcessor" + + # Critical tokens for validation when combining processors + CRITICAL_SPECIAL_TOKENS = ["<|im_start|>", "<|im_end|>"] + CRITICAL_ROLE_TOKENS = ["assistant", "user", "system"] + + def __init__( + self, + image_processor, + tokenizer, + video_processor=None, + *, + chat_template=None, + image_token="", + video_token="