diff --git a/examples/configs/recipes/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.yaml b/examples/configs/recipes/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.yaml
index 574db88263..258f82c4bc 100644
--- a/examples/configs/recipes/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.yaml
+++ b/examples/configs/recipes/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.yaml
@@ -1,15 +1,20 @@
 defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 128
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 16
 policy:
-  model_name: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
+  model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   tokenizer:
     name: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
-  max_total_sequence_length: 1024
-  train_global_batch_size: 128
+  max_total_sequence_length: 24576
+  #max_total_sequence_length: 1024
+  train_global_batch_size: 64
+  train_micro_batch_size: 1
+  logprob_batch_size: 2
   dtensor_cfg:
     activation_checkpointing: true
-    tensor_parallel_size: 8
+    context_parallel_size: 4
+    tensor_parallel_size: 2
     custom_parallel_plan: examples.configs.recipes.llm.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
   dynamic_batching:
     enabled: true
@@ -32,16 +37,19 @@ policy:
       - 13
   generation:
     vllm_cfg:
+      async_engine: false
       tensor_parallel_size: 4
+      #pipeline_parallel_size: 2
+  make_sequence_length_divisible_by: ${max:${mul:${policy.dtensor_cfg.context_parallel_size}, 2}, ${policy.max_total_sequence_length}}
 logger:
   wandb_enabled: true
   monitor_gpus: false
   wandb:
     project: grpo-nemotron-super-49b
-    name: grpo-${data.dataset_name}-nemotron-super-49b-tp${policy.dtensor_cfg.tensor_parallel_size}
+    name: grpo-${data.dataset_name}-nemotron-super-49b-tp${policy.dtensor_cfg.tensor_parallel_size}-cp${policy.dtensor_cfg.context_parallel_size}
   mlflow:
     experiment_name: sft-dev
     run_name: grpo-nemotron-super-49b
 cluster:
   gpus_per_node: 8
-  num_nodes: 4
+  num_nodes: 8
diff --git a/examples/configs/recipes/llm/llama_nemotron_super_49b_custom_plan.py b/examples/configs/recipes/llm/llama_nemotron_super_49b_custom_plan.py
index a0381adf9c..2922c69f9e 100644
--- a/examples/configs/recipes/llm/llama_nemotron_super_49b_custom_plan.py
+++ b/examples/configs/recipes/llm/llama_nemotron_super_49b_custom_plan.py
@@ -12,38 +12,73 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import cast
+
 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
     ParallelStyle,
-    PrepareModuleInput,
-    PrepareModuleOutput,
     RowwiseParallel,
+    SequenceParallel,
 )
 from torch.distributed.tensor.placement_types import Replicate, Shard
 
-custom_parallel_plan: dict[str, ParallelStyle] = {
-    "model.layers.*.self_attn": PrepareModuleInput(
-        input_kwarg_layouts={"attention_mask": Replicate()},
-        desired_input_kwarg_layouts={"attention_mask": Replicate()},
-    ),
-    "model.embed_tokens": RowwiseParallel(
-        input_layouts=Replicate(), output_layouts=Replicate(), use_local_output=True
-    ),
-    "model.layers.*.self_attn.q_proj": ColwiseParallel(use_local_output=False),
-    "model.layers.*.self_attn.k_proj": ColwiseParallel(use_local_output=False),
-    "model.layers.*.self_attn.v_proj": ColwiseParallel(use_local_output=False),
-    "model.layers.*.self_attn.o_proj": RowwiseParallel(
-        output_layouts=Replicate(), use_local_output=True
-    ),
-    "model.layers.*.self_attn.rotary_emb": PrepareModuleOutput(
-        output_layouts=(Replicate(), Replicate()),
-        desired_output_layouts=(Replicate(), Replicate()),
-        use_local_output=False,
-    ),
-    "model.layers.*.mlp.up_proj": ColwiseParallel(),
-    "model.layers.*.mlp.gate_proj": ColwiseParallel(),
-    "model.layers.*.mlp.down_proj": RowwiseParallel(
-        output_layouts=Replicate(), use_local_output=True
-    ),
-    "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
-}
+
+def get_custom_parallel_plan():
+    # Reuse llama default parallel plan
+    base_model_tp_plan: dict[str, ParallelStyle] = {
+        "model.embed_tokens": RowwiseParallel(input_layouts=Replicate()),
+        "model.layers.*.self_attn.q_proj": ColwiseParallel(),
+        "model.layers.*.self_attn.k_proj": ColwiseParallel(),
+        "model.layers.*.self_attn.v_proj": ColwiseParallel(),
+        "model.layers.*.self_attn.o_proj": RowwiseParallel(),
+        "model.layers.*.mlp.up_proj": ColwiseParallel(),
+        "model.layers.*.mlp.gate_proj": ColwiseParallel(),
+        "model.layers.*.mlp.down_proj": RowwiseParallel(),
+        "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
+    }
+
+    base_model_sp_plan = {
+        "model.embed_tokens": RowwiseParallel(
+            input_layouts=Replicate(), output_layouts=Shard(1)
+        ),
+        "model.norm": SequenceParallel(),
+        "model.layers.*.input_layernorm": SequenceParallel(),
+        "model.layers.*.self_attn.o_proj": RowwiseParallel(output_layouts=Shard(1)),
+        "model.layers.*.post_attention_layernorm": SequenceParallel(),
+        "model.layers.*.mlp.down_proj": RowwiseParallel(output_layouts=Shard(1)),
+        "lm_head": ColwiseParallel(
+            input_layouts=Shard(1), output_layouts=Shard(-1), use_local_output=False
+        ),
+    }
+
+    if False:
+        # Enable sequence parallelism only if TP size > 1
+        base_model_tp_plan.update(cast(dict[str, ParallelStyle], base_model_sp_plan))
+
+    return base_model_tp_plan
+
+
+custom_parallel_plan: dict[str, ParallelStyle] = get_custom_parallel_plan()
+# {
+
+#     "model.embed_tokens": RowwiseParallel(
+#         input_layouts=Replicate(), output_layouts=Replicate(), use_local_output=True
+#     ),
+#     "model.layers.*.self_attn.q_proj": ColwiseParallel(use_local_output=False),
+#     "model.layers.*.self_attn.k_proj": ColwiseParallel(use_local_output=False),
+#     "model.layers.*.self_attn.v_proj": ColwiseParallel(use_local_output=False),
+#     "model.layers.*.self_attn.o_proj": RowwiseParallel(
+#         output_layouts=Replicate(), use_local_output=True
+#     ),
+#     "model.layers.*.self_attn.rotary_emb": PrepareModuleOutput(
+#         output_layouts=(Replicate(), Replicate()),
+#         desired_output_layouts=(Replicate(), Replicate()),
+#         use_local_output=False,
+#     ),
+#     "model.layers.*.mlp.up_proj": ColwiseParallel(),
+#     "model.layers.*.mlp.gate_proj": ColwiseParallel(),
+#     "model.layers.*.mlp.down_proj": RowwiseParallel(
+#         output_layouts=Replicate(), use_local_output=True
+#     ),
+#     "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
+# }
diff --git a/examples/configs/sft_nemotron_super_49b.yaml b/examples/configs/sft_nemotron_super_49b.yaml
new file mode 100644
index 0000000000..d79837dbb8
--- /dev/null
+++ b/examples/configs/sft_nemotron_super_49b.yaml
@@ -0,0 +1,134 @@
+# SFT Algorithm Configuration
+sft:
+  max_num_epochs: 3
+  max_num_steps: 100
+  val_period: 10
+  val_batches: 8
+  val_global_batch_size: 128
+  val_micro_batch_size: 1
+  val_at_start: true
+  seed: 42
+
+checkpointing:
+  enabled: true
+  checkpoint_dir: "results/sft_nemotron_super_49b"
+  metric_name: "val_loss"
+  higher_is_better: false
+  keep_top_k: 100
+  save_period: 500
+  checkpoint_must_save_by: null
+
+policy:
+  # model_name: Qwen/Qwen2.5-7B-Instruct
+  # tokenizer:
+  #   name: Qwen/Qwen2.5-7B-Instruct
+  model_name: "/lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf"
+  tokenizer:
+    name: ${policy.model_name}
+  max_total_sequence_length: 4096
+  precision: "bfloat16"
+  train_global_batch_size: 128
+  train_micro_batch_size: 8
+
+  dtensor_cfg:
+    _v2: true
+    activation_checkpointing: true
+    context_parallel_size: 2
+    cpu_offload: false
+    enabled: true
+    sequence_parallel: false
+    tensor_parallel_size: 4
+    custom_parallel_plan: examples.configs.recipes.llm.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
+
+  megatron_cfg:
+    enabled: false
+
+  dynamic_batching:
+    enabled: false
+    train_mb_tokens: 4096
+    logprob_mb_tokens: 8192
+    sequence_length_round: 64
+
+  sequence_packing:
+    enabled: false
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+
+
+  # makes the training sequence length divisible by the tensor parallel size
+  # this is useful for sequence parallel training
+  make_sequence_length_divisible_by: ${max:${mul:${policy.dtensor_cfg.context_parallel_size}, 2}, ${policy.max_total_sequence_length}}
+  max_grad_norm: null
+
+  optimizer:
+    name: "torch.optim.AdamW"
+    kwargs:
+      lr: 2e-5
+      weight_decay: 0.01
+      betas: [0.9, 0.98]
+      eps: 1e-8
+      # when using Dtensor, we need to set foreach
+      # and fused to False
+      foreach: False
+      fused: False
+
+# data:
+#   add_bos: true
+#   add_eos: true
+#   add_generation_prompt: false
+#   dataset_name: "tulu3_sft_mixture"
+#   cache_dir: "/lustre/fsw/portfolios/coreai/users/gvenkatakris/data-cache"
+#   max_input_seq_length: 1024
+#   max_samples: 10000
+#   shuffle: true
+#   test_size: 0.05
+
+data:
+  max_input_seq_length: ${policy.max_total_sequence_length}
+  add_bos: true
+  add_eos: true
+  add_generation_prompt: false
+  shuffle: true
+  num_workers: 20
+
+  dataset_name: "squad"
+  # You can use custom response datasets for training and validation. For example:
+  # data:
+  #   dataset_name: ResponseDataset
+  #   train_data_path: <train_data_path>  # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
+  #   val_data_path: <val_data_path>
+  #   input_key: <input_key>, default is "input"
+  #   output_key: <output_key>, default is "output"
+  #   train_split: <train_split>, default is None  # used for HuggingFace datasets
+  #   val_split: <val_split>, default is None  # used for HuggingFace datasets
+  # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/sft.md#datasets for more details.
+
+  ## unused with squad dataset
+  prompt_file: null
+  split: null
+  output_key: null
+  seed: null
+
+logger:
+  log_dir: "logs"  # Base directory for all logs
+  wandb_enabled: true  # Make sure you do a ``wandb login [Your API key]'' before running
+  tensorboard_enabled: false
+  mlflow_enabled: false
+  monitor_gpus: false  # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  swanlab_enabled: false
+  wandb:
+    project: "sft-nemotron-joyang"
+    name: "sft-${data.dataset_name}-nemotron-super-49b-joyang"
+  tensorboard:
+    log_dir: "tb_logs-openmathinstruct-nemorl-1M_train"
+  mlflow:
+    experiment_name: "sft-dev"
+    run_name: "openmathinstruct-nemorl-1M_train"
+  gpu_monitoring:
+    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
+    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
+
+cluster:
+  gpus_per_node: 8
+  num_nodes: 1
diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py
index 51adfddced..eb73bc5a30 100644
--- a/examples/run_grpo_math.py
+++ b/examples/run_grpo_math.py
@@ -19,7 +19,7 @@
 from typing import Any, Optional
 
 from omegaconf import OmegaConf
-from transformers import PreTrainedTokenizerBase
+from transformers import AutoConfig, PreTrainedTokenizerBase
 
 from nemo_rl.algorithms.grpo import MasterConfig, grpo_train, setup
 from nemo_rl.algorithms.utils import get_tokenizer
@@ -41,6 +41,7 @@
 from nemo_rl.utils.logger import get_next_experiment_dir
 
 OmegaConf.register_new_resolver("mul", lambda a, b: a * b)
+OmegaConf.register_new_resolver("max", lambda a, b: max(a, b))
 
 
 def parse_args() -> tuple[argparse.Namespace, list[str]]:
@@ -158,7 +159,17 @@ def main() -> None:
     init_ray()
 
-    # setup tokenizer
+    # setup tokenizer and preload the model config to force HF to download the model and modules,
+    # avoiding a race condition inside the generation/policy workers.
+    try:
+        _ = AutoConfig.from_pretrained(
+            config["policy"]["model_name"], trust_remote_code=True
+        )
+        print(f"Config preloaded successfully: {config['policy']['model_name']}")
+    except Exception as e:
+        print("WARNING: error while preloading the model config; in general this is not a problem:")
+        print(e)
+
     tokenizer = get_tokenizer(config["policy"]["tokenizer"])
     assert config["policy"]["generation"] is not None, (
         "A generation config is required for GRPO"
diff --git a/examples/run_sft.py b/examples/run_sft.py
index b804b4e19f..bcda89e09c 100644
--- a/examples/run_sft.py
+++ b/examples/run_sft.py
@@ -32,6 +32,7 @@
 from nemo_rl.utils.logger import get_next_experiment_dir
 
 OmegaConf.register_new_resolver("mul", lambda a, b: a * b)
+OmegaConf.register_new_resolver("max", lambda a, b: max(a, b))
 
 
 def parse_args():
diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py
index 190f3c2921..fecbb884ce 100644
--- a/nemo_rl/algorithms/grpo.py
+++ b/nemo_rl/algorithms/grpo.py
@@ -793,7 +793,8 @@ def grpo_train(
 
             print("▶ Computing logprobs...", flush=True)
             with timer.time("policy_and_reference_logprobs"):
-                fprop_logprobs = policy.get_logprobs(train_data)["logprobs"]
+                logprobs_results = policy.get_logprobs(train_data)
+                fprop_logprobs = logprobs_results["logprobs"]
                 reference_logprobs = policy.get_reference_policy_logprobs(
                     train_data
                 )["reference_logprobs"]
@@ -915,12 +916,14 @@
                     log_data, f"train_data_step{total_steps}.jsonl"
                 )
 
+            print(f"train_results: {train_results['train_max_seq_len']}")
             metrics = {
                 "loss": train_results["loss"].numpy(),
+                "train_max_seq_len": train_results["train_max_seq_len"],
                 "reward": rewards.numpy(),
                 "grad_norm": train_results["grad_norm"].numpy(),
                 "mean_prompt_length": repeated_batch["length"].numpy(),
                 "total_num_tokens": input_lengths.numpy(),
             }
             metrics.update(train_results["all_mb_metrics"])
             for k, v in metrics.items():
diff --git a/nemo_rl/data/llm_message_utils.py b/nemo_rl/data/llm_message_utils.py
index c0572ce3a1..6591465120 100644
--- a/nemo_rl/data/llm_message_utils.py
+++ b/nemo_rl/data/llm_message_utils.py
@@ -552,29 +552,29 @@ def _format_content_helper(
         message_chunk = formatted_message[prev_message_len_no_eos:]
 
         # Debug: Print each message turn separately (only once for the first sample)
-        if not hasattr(get_formatted_message_log, "_debug_printed"):
-            if i == 0:
-                # Print header only at the start of first message
-                print("\n" + "=" * 80)
-                print("DEBUG: Individual message turns from apply_chat_template")
-                print("=" * 80)
-
-            print(f"\n[Turn {i + 1}/{len(message_log_strs)}] Role: {message['role']}")
-            print("-" * 40)
-            print("Extracted message chunk:")
-            print(repr(message_chunk))  # Using repr to show special characters
-            print(f"Raw text (len={len(message_chunk)}):")
-            print(message_chunk)
-            print("-" * 40)
-
-            if i == len(message_log_strs) - 1:
-                # Mark as printed after processing all turns of the first sample
-                get_formatted_message_log._debug_printed = True
-                print("\n" + "=" * 80)
-                print("DEBUG: Complete formatted conversation:")
-                print("-" * 80)
-                print(formatted_message)
-                print("=" * 80 + "\n")
+        # if not hasattr(get_formatted_message_log, "_debug_printed"):
+        #     if i == 0:
+        #         # Print header only at the start of first message
+        #         print("\n" + "=" * 80)
+        #         print("DEBUG: Individual message turns from apply_chat_template")
+        #         print("=" * 80)
+
+        #     print(f"\n[Turn {i + 1}/{len(message_log_strs)}] Role: {message['role']}")
+        #     print("-" * 40)
+        #     print("Extracted message chunk:")
+        #     print(repr(message_chunk))  # Using repr to show special characters
+        #     print(f"Raw text (len={len(message_chunk)}):")
+        #     print(message_chunk)
+        #     print("-" * 40)
+
+        #     if i == len(message_log_strs) - 1:
+        #         # Mark as printed after processing all turns of the first sample
+        #         get_formatted_message_log._debug_printed = True
+        #         print("\n" + "=" * 80)
+        #         print("DEBUG: Complete formatted conversation:")
+        #         print("-" * 80)
+        #         print(formatted_message)
+        #         print("=" * 80 + "\n")
 
         if i == 0:
             if add_bos_token:
diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py
index ed919579b7..298ae7e7f6 100644
--- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py
+++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py
@@ -18,8 +18,10 @@
 import warnings
 from collections import defaultdict
 from contextlib import AbstractContextManager, contextmanager, nullcontext
+from functools import wraps
 from typing import Any, Generator, Iterable, Optional, cast
 
+import humanize
 import ray
 import torch
 from accelerate import init_empty_weights
@@ -44,6 +46,7 @@
     get_cpu_state_dict,
     to_local_if_dtensor,
 )
+from tabulate import tabulate
 from torch import nn
 from torch.distributed.checkpoint.state_dict import (
     StateDictOptions,
@@ -92,6 +95,26 @@
 from nemo_rl.utils.nsys import wrap_with_nvtx_name
 
 
+def mem_stats(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        torch.cuda.reset_peak_memory_stats()
+        ret = func(*args, **kwargs)
+        allocated = humanize.naturalsize(torch.cuda.memory_allocated())
+        reserved = humanize.naturalsize(torch.cuda.memory_reserved())
+        peak_allocated = humanize.naturalsize(torch.cuda.max_memory_allocated())
+        peak_reserved = humanize.naturalsize(torch.cuda.max_memory_reserved())
+
+        headers = ["Allocated", "Peak Allocated", "Reserved", "Peak Reserved"]
+        row = [[allocated, peak_allocated, reserved, peak_reserved]]
+
+        print("Memory stats:")
+        print(tabulate(row, headers=headers, tablefmt="grid"))
+        return ret
+
+    return wrapper
+
+
 @ray.remote(
     runtime_env=get_runtime_env_for_policy_worker("dtensor_policy_worker_v2")
 )  # pragma: no cover
@@ -180,6 +203,8 @@ def __init__(
             else None,
         )
 
+        # model_config.num_hidden_layers = 2
+
         self.allow_flash_attn_args = self.check_model_allow_flash_attn_args(
             model_config
         )
@@ -245,10 +270,7 @@ def __init__(
         # https://github.com/NVIDIA-NeMo/Automodel/blob/7e748be260651349307862426c0c168cebdeeec3/nemo_automodel/components/_transformers/auto_model.py#L180
         self.model = model_class.from_config(
             model_config,
-            attn_implementation="flash_attention_2"
-            if self.enable_seq_packing
-            else None,
-            use_liger_kernel=False,
+            attn_implementation="sdpa",
             trust_remote_code=True,
             torch_dtype=str(model_config.torch_dtype),
         )
@@ -488,6 +510,7 @@ def get_gpu_info(self) -> dict[str, Any]:
         """Return information about the GPU being used by this worker."""
         return get_gpu_info(self.model)
 
+    @mem_stats
     @wrap_with_nvtx_name("dtensor_policy_worker_v2/train")
     def train(
         self,
@@ -498,6 +521,8 @@ def train(
         mbs: Optional[int] = None,
     ) -> dict[str, Any]:
         """Train the policy on a batch of data with a given loss function."""
+        max_seq_len_in_step = 0
+
         if gbs is None:
             gbs = self.cfg["train_global_batch_size"]
         if mbs is None:
@@ -657,6 +682,8 @@ def train(
                 if len(vlm_kwargs) > 0:
                     position_ids = None
 
+                max_seq_len_in_step = max(max_seq_len_in_step, seq_len)
+
                 context_parallel_ctx = None
                 if self.cp_size > 1:
                     assert len(vlm_kwargs) == 0, (
@@ -859,6 +886,7 @@ def train(
             "gpu_name": torch.cuda.get_device_name(),
             "model_dtype": self.dtype,
"all_mb_metrics": dict(mb_metrics), + "train_max_seq_len": max_seq_len_in_step, } return metrics @@ -880,6 +908,7 @@ def get_logprobs( We use the convention that the logprob of the first token is 0 so that the sequence length is maintained. The logprob of input token i is specified at position i in the output logprobs tensor. """ + max_seq_len_in_step = 0 logprob_batch_size = ( micro_batch_size if micro_batch_size is not None @@ -986,6 +1015,8 @@ def get_logprobs( if len(vlm_kwargs) > 0: position_ids = None + max_seq_len_in_step = max(max_seq_len_in_step, seq_len) + context_parallel_ctx = None if self.cp_size > 1: assert len(vlm_kwargs) == 0, ( @@ -1167,6 +1198,9 @@ def get_logprobs( ) all_log_probs_padded.append(lp) return_data["logprobs"] = torch.cat(all_log_probs_padded, dim=0).cpu() + print(f"get_logprobs: max_seq_len_in_step: {max_seq_len_in_step}") + + # return_data["logprobs_max_seq_len"] = max_seq_len_in_step return return_data diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index 23746a8037..c2b1132f52 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -430,6 +430,7 @@ def train( aggregated_results = { "loss": results[0]["global_loss"], "grad_norm": results[0]["grad_norm"], + "train_max_seq_len": max([r["train_max_seq_len"] for r in results]), } if self.flops_tracker is not None: diff --git a/pyproject.toml b/pyproject.toml index 69d3d9fea4..9260c89ea9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,8 @@ dependencies = [ "mlflow", "nvidia-nvshmem-cu12", # for deep_ep build "swanlab", + "humanize", + "tabulate", ] [project.optional-dependencies] @@ -202,11 +204,12 @@ requires-dist = ["torch", "einops", "setuptools", "psutil", "ninja"] [tool.black] line-length = 120 include = '\.pyi?$' -exclude = ''' +force-exclude = ''' /( \.git | \.venv | build + | nemo_rl/models/dtensor/custom_models )/ ''' diff --git a/uv.lock b/uv.lock index 50e2abc88f..37e23d311c 100644 --- a/uv.lock +++ b/uv.lock @@ -1714,6 +1714,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" }, ] +[[package]] +name = "humanize" +version = "4.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/1d/3062fcc89ee05a715c0b9bfe6490c00c576314f27ffee3a704122c6fd259/humanize-4.13.0.tar.gz", hash = "sha256:78f79e68f76f0b04d711c4e55d32bebef5be387148862cb1ef83d2b58e7935a0", size = 81884, upload-time = "2025-08-25T09:39:20.04Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/c7/316e7ca04d26695ef0635dc81683d628350810eb8e9b2299fc08ba49f366/humanize-4.13.0-py3-none-any.whl", hash = "sha256:b810820b31891813b1673e8fec7f1ed3312061eab2f26e3fa192c393d11ed25f", size = 128869, upload-time = "2025-08-25T09:39:18.54Z" }, +] + [[package]] name = "hydra-core" version = "1.3.2" @@ -2901,6 +2910,7 @@ dependencies = [ { name = "colored" }, { name = "datasets" }, { name = "debugpy" }, + { name = "humanize" }, { name = "hydra-core" }, { name = "math-verify" }, { name = "matplotlib" }, @@ -2919,6 +2929,7 @@ dependencies = [ { name = "setuptools" }, { name = "swanlab" }, { name = "sympy" }, + { name = "tabulate" }, { name = "tensorboard" }, { name = "tiktoken" }, { name = "torch", version = "2.7.1", source = { registry = 
"https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, @@ -3009,6 +3020,7 @@ requires-dist = [ { name = "flash-attn", marker = "extra == 'automodel'", specifier = "==2.7.4.post1" }, { name = "flash-attn", marker = "extra == 'mcore'", specifier = "==2.7.4.post1" }, { name = "flash-attn", marker = "extra == 'vllm'", specifier = "==2.7.4.post1" }, + { name = "humanize" }, { name = "hydra-core" }, { name = "mamba-ssm", marker = "extra == 'automodel'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, { name = "mamba-ssm", marker = "extra == 'vllm'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, @@ -3033,6 +3045,7 @@ requires-dist = [ { name = "setuptools" }, { name = "swanlab" }, { name = "sympy", specifier = ">=1.14.0" }, + { name = "tabulate" }, { name = "tensorboard" }, { name = "tiktoken" }, { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.7.1", index = "https://download.pytorch.org/whl/cu128" }, @@ -5617,6 +5630,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "tensorboard" version = "2.20.0"