InfiniTensor · ArcaLunar · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 23, 2026
diff --git a/example/common/utils.cc b/example/common/utils.cc
@@ -1,5 +1,10 @@
 #include "example/common/utils.h"
 
+#include "gflags/gflags.h"
+#include "gflags/gflags_declare.h"
+#include "glog/logging.h"
+#include "infini_train/include/nn/parallel/global.h"
+
 namespace infini_train {
 
 float ConvertBF16ToFloat(void *ptr) {
@@ -61,4 +66,53 @@ void ReadVectorShardFloat(std::ifstream &ifs, float *dst, int64_t len, int64_t s
     ifs.seekg(base + std::streamoff(len * sizeof(float)));
 }
 
+// Restores training state from `flag_resume_root`, preferring its "rank_NNNNNN" subdirectory in parallel runs.
+// Returns (global_step, best_loss, data_batch_idx); an empty root means a fresh run and yields (0, +inf, 0).
+std::tuple<int, float, size_t> ResumeFromCheckpoint(
+    const fLS::clstring &flag_resume_root,    // checkpoint directory; empty disables resuming
+    const nn::parallel::Rank &rank,           // picks the per-rank subdirectory and gates main-rank logging
+    std::shared_ptr<nn::Module> model,        // handed to Checkpoint::Load
+    std::shared_ptr<Optimizer> optimizer,     // handed to Checkpoint::Load
+    DistributedDataLoader &train_loader,      // produces the repositioned iterator
+    TrainerState &state,                      // filled in by Checkpoint::Load
+    DataLoaderIterator &train_iter,           // out: iterator at the checkpointed batch index
+    CheckpointLoadOptions model_bin_loader) { // load options, forwarded to Checkpoint::Load as-is
+    const int dp_world_size = nn::parallel::global::GetDataParallelSize();
+
+    if (flag_resume_root.empty()) {
+        LOG(INFO) << "No checkpoint specified for resume. Starting training from scratch.";
+        return {0, std::numeric_limits<float>::infinity(), size_t{0}};
+    }
+
+    // Parallel runs keep one subdirectory per rank; fall back to the root when it does not exist.
+    std::filesystem::path load_dir = [&]() -> std::filesystem::path {
+        const std::filesystem::path root = flag_resume_root;
+        if (!rank.IsParallel()) {
+            return root;
+        }
+        const auto per_rank = root / std::format("rank_{:06d}", rank.GlobalRank());
+        return std::filesystem::exists(per_rank) ? per_rank : root;
+    }();
+
+    Checkpoint::Load(load_dir, model.get(), optimizer.get(), &state, model_bin_loader);
+    const int resumed_step = static_cast<int>(state.global_step);
+    const float resumed_best_loss = state.best_loss;
+
+    // A stride mismatch is only reported; the recorded batch index is used unchanged.
+    if (state.data_batch_stride != static_cast<int64_t>(dp_world_size) && rank.IsMainRank()) {
+        LOG(WARNING) << std::format("Checkpoint data_batch_stride {} mismatches current ddp_world_size {}. "
+                                    "Proceeding with recorded data_batch_idx {}.",
+                                    state.data_batch_stride, dp_world_size, state.data_batch_idx);
+    }
+
+    size_t resume_batch_idx = static_cast<size_t>(std::max<int64_t>(state.data_batch_idx, 0)); // clamp negatives
+    train_iter = train_loader.IteratorAtBatchIndex(resume_batch_idx);
+    if (rank.IsMainRank()) {
+        LOG(INFO) << std::format(
+            "Resume training from step {} with best_loss {:.6f}, last_lr {:.3e}, data_batch_idx {}", state.global_step,
+            state.best_loss, state.last_lr, state.data_batch_idx);
+    }
+    return {resumed_step, resumed_best_loss, resume_batch_idx};
+}
+
 } // namespace infini_train
diff --git a/example/common/utils.h b/example/common/utils.h
@@ -1,8 +1,19 @@
 #pragma once
 
+#include "infini_train/include/checkpoint.h"
+#include "infini_train/include/dataloader.h"
+#include "infini_train/include/nn/modules/module.h"
+#include "infini_train/include/nn/parallel/rank.h"
+#include "infini_train/include/optimizer.h"
+
+#include "gflags/gflags.h"
+
 #include <cstdint>
 #include <cstring>
+#include <filesystem>
 #include <fstream>
+#include <functional>
+#include <tuple>
 #include <vector>
 
 namespace infini_train {
@@ -30,4 +41,19 @@ void ReadVectorAllFloat(std::ifstream &ifs, float *dst, int64_t len);
 
 void ReadVectorShardFloat(std::ifstream &ifs, float *dst, int64_t len, int64_t start, int64_t cnt);
 
+/**
+ * Loads a checkpoint via Checkpoint::Load and repositions train_iter; returns early if flag_resume_root is empty.
+ * @returns (global_step, best_loss, data_batch_idx) from the checkpoint, or (0, +infinity, 0) when not resuming.
+ */
+std::tuple<int, float, size_t> ResumeFromCheckpoint(
+    const fLS::clstring &flag_resume_root, // checkpoint directory to resume from; empty means start from scratch
+    const nn::parallel::Rank &rank,        // in parallel runs a "rank_NNNNNN" subdirectory is used when it exists
+    std::shared_ptr<nn::Module> model,     // model to be loaded with checkpoint state
+    std::shared_ptr<Optimizer> optimizer,  // optimizer passed to Checkpoint::Load; it may or may not carry state
+    DistributedDataLoader &train_loader,   // distributed dataloader that provides the resumed iterator
+    TrainerState &state,                   // trainer state to be loaded from checkpoint
+    DataLoaderIterator
+        &train_iter, // set to train_loader.IteratorAtBatchIndex(...) for the checkpointed batch index
+    CheckpointLoadOptions model_bin_loader); // full load options forwarded to Checkpoint::Load
+
 } // namespace infini_train
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
@@ -1,15 +1,20 @@
+#include <algorithm>
 #include <chrono>
 #include <cstdlib>
+#include <filesystem>
 #include <format>
+#include <limits>
 #include <memory>
 #include <optional>
 #include <unordered_map>
 #include <unordered_set>
 
+#include "example/common/utils.h"
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 
 #include "infini_train/include/autocast.h"
+#include "infini_train/include/checkpoint.h"
 #include "infini_train/include/core/runtime/device_guard.h"
 #include "infini_train/include/dataloader.h"
 #include "infini_train/include/device.h"
@@ -75,6 +80,12 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage.");
 
 // precision
 DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)");
+DEFINE_uint32(save_steps, 0, "save checkpoint every N steps; 0 saves only the final checkpoint");
+DEFINE_string(resume_from, "", "checkpoint directory to resume from");
+DEFINE_string(checkpoint_dir, "./checkpoints", "root directory used to store checkpoints");
+DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep");
+DEFINE_bool(save_optimizer_state, true, "whether optimizer state is persisted in checkpoints");
+DEFINE_string(checkpoint_format, "bin", "checkpoint format: bin|pth");
 // precision check
 DEFINE_string(
     precision_check, "",
@@ -198,6 +209,8 @@ void Train(const nn::parallel::Rank &rank) {
     } else {
         model = GPT2::FromPretrained(kStrToModelType.at(FLAGS_model));
     }
+    auto llmc_model = std::dynamic_pointer_cast<GPT2>(model);
+    CHECK(llmc_model != nullptr) << "Failed to cast model to GPT2 for LLMC checkpoint I/O.";
 
     model->To(device);
 
@@ -311,6 +324,7 @@ void Train(const nn::parallel::Rank &rank) {
     }
 
     auto train_iter = train_loader.begin();
+    size_t saved_data_batch_idx = train_iter.BatchIndex();
     std::shared_ptr<nn::Module> loss_fn
         = (tp_world_size > 1) ? std::static_pointer_cast<nn::Module>(
               std::make_shared<VocabParallelCrossEntropyLoss>(model_config.original_vocab_size))
@@ -320,9 +334,69 @@ void Train(const nn::parallel::Rank &rank) {
 
     auto impl = core::GetDeviceGuardImpl(device.type());
 
-    LOG(INFO) << "start training";
+    int start_step = 0;
+    float best_loss = std::numeric_limits<float>::infinity();
+    TrainerState state;
+    CheckpointLoadOptions load_options;
+    load_options.load_optimizer_state = true;
+    load_options.model_bin_loader = [](nn::Module *target_model, const std::filesystem::path &model_path) {
+        auto loaded_model = GPT2::FromLLMC(model_path.string());
+        target_model->LoadStateDict(loaded_model->StateDict());
+    };
+    std::tie(start_step, best_loss, saved_data_batch_idx) = infini_train::ResumeFromCheckpoint(
+        FLAGS_resume_from, rank, model, optimizer, train_loader, state, train_iter, load_options);
+
+    // Saves model/optimizer/trainer state into `save_dir`. On the main rank it can also prune the oldest
+    // "checkpoint_step_*" directories under FLAGS_checkpoint_dir so at most FLAGS_max_checkpoint_keep remain.
+    auto save_checkpoint = [&](const std::filesystem::path &save_dir, int64_t global_step,
+                               bool prune_step_checkpoints) {
+        const auto ckpt_start = std::chrono::high_resolution_clock::now();
+        TrainerState ckpt_state; // not named `state`, which would shadow the resume state in the enclosing scope
+        ckpt_state.global_step = global_step;
+        ckpt_state.data_batch_idx = saved_data_batch_idx;
+        ckpt_state.data_batch_stride = ddp_world_size;
+        ckpt_state.best_loss = best_loss;
+        ckpt_state.last_lr = FLAGS_learning_rate;
+        ckpt_state.optimizer_type = "SGD"; // NOTE(review): hard-coded; confirm it matches the optimizer in use
+        ckpt_state.checkpoint_format = FLAGS_checkpoint_format;
+        ckpt_state.ddp_size = ddp_world_size;
+        ckpt_state.tp_size = tp_world_size;
+        ckpt_state.sp_size = sp_world_size;
+        ckpt_state.pp_size = pp_world_size;
+        CheckpointOptions options;
+        options.format = FLAGS_checkpoint_format;
+        options.save_optimizer_state = FLAGS_save_optimizer_state;
+        options.model_bin_writer = [&](const nn::Module &, const std::filesystem::path &model_path) {
+            llmc_model->SaveAsLLMC(model_path.string());
+        };
+        Checkpoint::Save(save_dir, *model, *optimizer, ckpt_state, options);
+        const auto ckpt_end = std::chrono::high_resolution_clock::now();
+        const double ckpt_ms = std::chrono::duration<double, std::milli>(ckpt_end - ckpt_start).count();
+
+        if (!rank.IsMainRank()) {
+            return;
+        }
+        LOG(INFO) << std::format("Checkpoint saved at: {} ({:.2f} ms)", save_dir.string(), ckpt_ms);
+        const auto root = std::filesystem::path(FLAGS_checkpoint_dir);
+        if (!prune_step_checkpoints || !std::filesystem::exists(root)) {
+            return;
+        }
+        std::vector<std::filesystem::path> ckpts;
+        for (const auto &entry : std::filesystem::directory_iterator(root)) {
+            if (entry.is_directory() && entry.path().filename().string().starts_with("checkpoint_step_")) {
+                ckpts.push_back(entry.path());
+            }
+        }
+        // Shorter names first: keeps numeric order once a step number outgrows the 6-digit zero padding.
+        std::sort(ckpts.begin(), ckpts.end(), [](const auto &a, const auto &b) {
+            return a.native().size() != b.native().size() ? a.native().size() < b.native().size() : a < b;
+        });
+        for (size_t i = 0; i + FLAGS_max_checkpoint_keep < ckpts.size(); ++i) {
+            std::filesystem::remove_all(ckpts[i]); // NOTE(review): a keep count of 0 also removes the one just saved
+        }
+    };
 
-    for (int step = 0; step < FLAGS_num_iteration + 1; ++step) {
+    for (int step = start_step; step < FLAGS_num_iteration + 1; ++step) {
         // Reset precision check counters at start of each iteration for file overwrite
         utils::PrecisionChecker::ResetCounters();
 
@@ -372,6 +446,7 @@ void Train(const nn::parallel::Rank &rank) {
                 // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
                 // TODO(dcj): support dataloader.reset() later
                 ++train_iter;
+                saved_data_batch_idx = train_iter.BatchIndex();
                 x = std::make_shared<Tensor>(x->To(device));
                 y = std::make_shared<Tensor>(y->To(device));
 
@@ -401,6 +476,7 @@ void Train(const nn::parallel::Rank &rank) {
             // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
             // TODO(dcj): support dataloader.reset() later
             ++train_iter;
+            saved_data_batch_idx = train_iter.BatchIndex();
             x = std::make_shared<Tensor>(x->To(device));
             y = std::make_shared<Tensor>(y->To(device));
 
@@ -413,6 +489,8 @@ void Train(const nn::parallel::Rank &rank) {
             lossf = static_cast<const float *>(lossf_tensor->To(Device()).DataPtr())[0];
         }
 
+        best_loss = std::min(best_loss, lossf);
+
         const auto iter_end = std::chrono::high_resolution_clock::now();
         const double duration_us = std::chrono::duration<double, std::micro>(iter_end - iter_start).count();
         const double tps = FLAGS_total_batch_size / (duration_us / 1e6);
@@ -435,8 +513,22 @@ void Train(const nn::parallel::Rank &rank) {
                 }
             }
         }
+
+        if (FLAGS_save_steps > 0 && (step + 1) % FLAGS_save_steps == 0) {
+            std::filesystem::path step_dir
+                = std::filesystem::path(FLAGS_checkpoint_dir) / std::format("checkpoint_step_{:06d}", step + 1);
+            if (rank.IsParallel()) {
+                step_dir /= std::format("rank_{:06d}", rank.GlobalRank());
+            }
+            save_checkpoint(step_dir, step + 1, true);
+        }
     }
 
+    std::filesystem::path final_dir = std::filesystem::path(FLAGS_checkpoint_dir) / "checkpoint_final";
+    if (rank.IsParallel()) {
+        final_dir /= std::format("rank_{:06d}", rank.GlobalRank());
+    }
+    save_checkpoint(final_dir, FLAGS_num_iteration, false);
     // Save LoRA weights if enabled and path specified
     if (lora_enabled && !FLAGS_lora_save_path.empty()) {
         LOG(INFO) << "Saving LoRA weights to: " << FLAGS_lora_save_path;

diff --git a/example/gpt2/net.cc b/example/gpt2/net.cc
@@ -719,4 +719,112 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
     return local_gpt2;
 }
 
+/**
+ * Serializes this model to `filepath` as float32 in the LLMC GPT-2 layout (intended counterpart of FromLLMC()).
+ *
+ * Everything is written in the fixed order below:
+ *   1. A header of 256 int32 values. Slots 0..7 hold, in order: kHeaderMagic, kHeaderFP32Version, block_size,
+ *      original_vocab_size, n_layer, n_head, n_embd, vocab_size. All remaining slots are zero.
+ *   2. The wte embedding weight, then the wpe embedding weight.
+ *   3. Twelve per-block parameters. Each one is written for block 0, 1, ..., n_layer - 1 before the next one
+ *      starts, i.e. the data is grouped by parameter rather than by block:
+ *        - Block::kLn1LayerName weight, bias
+ *        - Block::kAttnLayerName: kCAttnLayerName weight, bias; kCProjLayerName weight, bias
+ *        - Block::kLn2LayerName weight, bias
+ *        - Block::kMlpLayerName: kCFcLayerName weight, bias; kCProjLayerName weight, bias
+ *   4. The GPT2LastStage::kLnFLayerName weight, then its bias.
+ *
+ * Tensors whose dtype is not float32 are converted before being written.
+ *
+ * CHECK-fails when the tensor- or pipeline-parallel size is not 1, when the file cannot be opened, when an
+ * expected entry is missing from StateDict(), or when the stream is not good after the final flush.
+ *
+ * @note Individual writes are not checked; the only stream check is the one after the final flush.
+ *
+ * @param filepath Path of the output file, opened with std::ofstream in binary mode.
+ */
+void GPT2::SaveAsLLMC(const std::string &filepath) const {
+    CHECK_EQ(nn::parallel::global::GetTensorParallelSize(), 1) << "SaveAsLLMC currently supports TP=1 only.";
+    CHECK_EQ(nn::parallel::global::GetPipelineParallelSize(), 1) << "SaveAsLLMC currently supports PP=1 only.";
+
+    std::ofstream ofs(filepath, std::ios::binary);
+    CHECK(ofs.is_open()) << "Failed to open model file for write: " << filepath;
+
+    // 256-slot header: the first eight slots carry metadata, the rest stay zero.
+    std::vector<int32_t> header = {
+        static_cast<int32_t>(kHeaderMagic),
+        static_cast<int32_t>(kHeaderFP32Version),
+        static_cast<int32_t>(config_.block_size),
+        static_cast<int32_t>(config_.original_vocab_size),
+        static_cast<int32_t>(config_.n_layer),
+        static_cast<int32_t>(config_.n_head),
+        static_cast<int32_t>(config_.n_embd),
+        static_cast<int32_t>(config_.vocab_size),
+    };
+    header.resize(256, 0);
+    ofs.write(reinterpret_cast<const char *>(header.data()),
+              static_cast<std::streamsize>(header.size() * sizeof(int32_t)));
+
+    const auto params = StateDict();
+
+    // Looks up `name` in the state dict, copies the tensor to a default-constructed Device (presumably host
+    // memory — TODO confirm), converts it to float32 if necessary and appends its raw bytes to the file.
+    const auto write_param = [&](const std::string &name) {
+        CHECK(params.contains(name)) << "Missing tensor in GPT2 state_dict: " << name;
+        Tensor host = params.at(name)->To(Device());
+        if (host.Dtype() != DataType::kFLOAT32) {
+            host = host.To(DataType::kFLOAT32);
+        }
+        ofs.write(reinterpret_cast<const char *>(host.DataPtr()), static_cast<std::streamsize>(host.SizeInBytes()));
+    };
+
+    // transformer.wte.weight, then transformer.wpe.weight
+    write_param(std::format("{}.{}.{}", kTransformerLayerName, GPT2FirstStage::kWTELayerName,
+                            nn::parallel::VocabParallelEmbedding::kParamWeightName));
+    write_param(std::format("{}.{}.{}", kTransformerLayerName, GPT2FirstStage::kWPELayerName,
+                            nn::Embedding::kParamWeightName));
+
+    // Per-block parameter names relative to "<transformer>.<h>.<block index>.", listed in file order.
+    const std::vector<std::string> per_block_params = {
+        std::format("{}.{}", Block::kLn1LayerName, nn::LayerNorm::kParamWeightName),
+        std::format("{}.{}", Block::kLn1LayerName, nn::LayerNorm::kParamBiasName),
+
+        std::format("{}.{}.{}", Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName,
+                    nn::parallel::ColumnParallelLinear::kParamWeightName),
+        std::format("{}.{}.{}", Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName,
+                    nn::parallel::ColumnParallelLinear::kParamBiasName),
+        std::format("{}.{}.{}", Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName,
+                    nn::parallel::RowParallelLinear::kParamWeightName),
+        std::format("{}.{}.{}", Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName,
+                    nn::parallel::RowParallelLinear::kParamBiasName),
+
+        std::format("{}.{}", Block::kLn2LayerName, nn::LayerNorm::kParamWeightName),
+        std::format("{}.{}", Block::kLn2LayerName, nn::LayerNorm::kParamBiasName),
+
+        std::format("{}.{}.{}", Block::kMlpLayerName, MLP::kCFcLayerName,
+                    nn::parallel::ColumnParallelLinear::kParamWeightName),
+        std::format("{}.{}.{}", Block::kMlpLayerName, MLP::kCFcLayerName,
+                    nn::parallel::ColumnParallelLinear::kParamBiasName),
+        std::format("{}.{}.{}", Block::kMlpLayerName, MLP::kCProjLayerName,
+                    nn::parallel::RowParallelLinear::kParamWeightName),
+        std::format("{}.{}.{}", Block::kMlpLayerName, MLP::kCProjLayerName,
+                    nn::parallel::RowParallelLinear::kParamBiasName),
+    };
+    // Parameter-major order: every block's copy of one parameter is written before the next parameter.
+    for (const auto &suffix : per_block_params) {
+        for (int idx = 0; idx < config_.n_layer; ++idx) {
+            write_param(std::format("{}.{}.{}.{}", kTransformerLayerName, GPT2Chunk::kHLayerName, idx, suffix));
+        }
+    }
+
+    // Final layer norm: weight, then bias.
+    write_param(std::format("{}.{}.{}", kTransformerLayerName, GPT2LastStage::kLnFLayerName,
+                            nn::LayerNorm::kParamWeightName));
+    write_param(std::format("{}.{}.{}", kTransformerLayerName, GPT2LastStage::kLnFLayerName,
+                            nn::LayerNorm::kParamBiasName));
+
+    ofs.flush();
+    CHECK(ofs.good()) << "Failed to flush model file: " << filepath;
+}
+
 int GPT2::GetChunkSize() const { return stage_info_.layer_ranges_per_chunk.size(); }
diff --git a/example/gpt2/net.h b/example/gpt2/net.h
@@ -141,6 +141,7 @@ class GPT2 : public infini_train::nn::CloneableModule<GPT2> {
 
     static std::shared_ptr<GPT2> FromPretrained(ModelType model_type);
     static std::shared_ptr<GPT2> FromLLMC(const std::string &filepath);
+    void SaveAsLLMC(const std::string &filepath) const; // writes fp32 weights in LLMC layout; needs TP=1 and PP=1
 
     int GetChunkSize() const;