
Commit cef3f92

Authored by hummuscience (Muad Abd El Hay), Claude, and themattinthehatt
Fix checkpoint compatibility for upsampling_layers (#314)
* Fix checkpoint compatibility for upsampling_layers

Older checkpoints may have 'upsampling_layers' parameters without the 'head.' prefix, causing warnings when loading models after head refactoring. This fix remaps these keys during checkpoint loading to ensure backwards compatibility.

Fixes warning: "Found keys that are not in the model state dict but in the checkpoint: ['upsampling_layers.1.weight', 'upsampling_layers.1.bias', ...]"

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

* Add fallback mechanism for checkpoint loading with weights_only=False

- Implement try/except blocks around torch.load() calls in three files
- First attempts standard loading, falls back to weights_only=False on failure
- Provides clear warning messages when fallback is used
- Resolves pickle deserialization errors with older checkpoints
- Maintains security by attempting the safer method first

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

* add more transformer tests

---------

Co-authored-by: Muad Abd El Hay <[email protected]>
Co-authored-by: Claude <[email protected]>
Co-authored-by: Muad Abd El Hay <[email protected]>
Co-authored-by: themattinthehatt <[email protected]>
1 parent 620459a commit cef3f92
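
In short, the patch renames legacy state-dict keys before handing the checkpoint to Lightning. A minimal sketch of that remap (the helper name is hypothetical and not part of the lightning-pose API; the actual change does this inline in load_model_from_checkpoint, shown in the predictions.py diff below):

def remap_upsampling_keys(state_dict: dict) -> dict:
    # prefix legacy 'upsampling_layers.*' keys with 'head.' so old checkpoints
    # line up with the refactored model's parameter names
    return {
        ("head." + key if key.startswith("upsampling_layers.") else key): value
        for key, value in state_dict.items()
    }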

5 files changed: +115 −47 lines

lightning_pose/models/backbones/vits.py

Lines changed: 7 additions & 1 deletion
@@ -63,7 +63,13 @@ def build_backbone(backbone_arch: str, image_size: int = 256, **kwargs):
 
 def load_vit_backbone_checkpoint(base, checkpoint: str):
     print(f"Loading VIT-MAE weights from {checkpoint}")
-    ckpt_vit_pretrain = torch.load(checkpoint, map_location="cpu")
+    # Try loading with default settings first, fallback to weights_only=False if needed
+    try:
+        ckpt_vit_pretrain = torch.load(checkpoint, map_location="cpu")
+    except Exception as e:
+        print(f"Warning: Failed to load checkpoint with default settings: {e}")
+        print("Attempting to load with weights_only=False...")
+        ckpt_vit_pretrain = torch.load(checkpoint, map_location="cpu", weights_only=False)
     # extract state dict if checkpoint contains additional info
     if "state_dict" in ckpt_vit_pretrain:
         ckpt_vit_pretrain = ckpt_vit_pretrain["state_dict"]
lightning_pose/utils/predictions.py

Lines changed: 47 additions & 41 deletions
@@ -559,49 +559,55 @@ def load_model_from_checkpoint(
         map_type=cfg.model.model_type,
         semi_supervised=semi_supervised,
     )
-    # initialize a model instance, with weights loaded from .ckpt file
-    if cfg.model.backbone == "vitb_sam":
-        # see https://github.com/paninski-lab/lightning-pose/issues/134 for explanation of this block
-        from lightning_pose.utils.scripts import get_model
-
-        # load model first
-        model = get_model(
-            cfg,
-            data_module=data_module,
-            loss_factories=loss_factories,
+
+    # initialize a model instance, load weights from .ckpt file (fix state_dict keys if needed)
+    try:
+        checkpoint = torch.load(ckpt_file)
+    except Exception as e:
+        print(f"Warning: Failed to load checkpoint with default settings: {e}")
+        print("Attempting to load with weights_only=False...")
+        checkpoint = torch.load(ckpt_file, weights_only=False)
+    state_dict = checkpoint.get("state_dict", checkpoint)
+
+    # fix state dict key mismatch for upsampling layers
+    # old checkpoints may have 'upsampling_layers' without 'head.' prefix
+    keys_remapped = False
+    for key in list(state_dict.keys()):
+        if key.startswith("upsampling_layers."):
+            # Add 'head.' prefix if missing
+            new_key = "head." + key
+            state_dict[new_key] = state_dict.pop(key)
+            keys_remapped = True
+
+    if keys_remapped:
+        # save the fixed state dict back to checkpoint
+        checkpoint["state_dict"] = state_dict
+        # create a temporary file with the fixed checkpoint
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix='.ckpt', delete=False) as tmp_file:
+            torch.save(checkpoint, tmp_file.name)
+            fixed_ckpt_file = tmp_file.name
+    else:
+        fixed_ckpt_file = ckpt_file
+
+    if semi_supervised:
+        model = ModelClass.load_from_checkpoint(
+            fixed_ckpt_file,
+            loss_factory=loss_factories["supervised"],
+            loss_factory_unsupervised=loss_factories["unsupervised"],
+            strict=False,
         )
-        # # update model parameter
-        # if model.backbone.pos_embed is not None:
-        #     # re-initialize absolute positional embedding with *finetune* image size.
-        #     finetune_img_size = cfg.data.image_resize_dims.height
-        #     patch_size = model.backbone.patch_size
-        #     embed_dim = 768  # value from lightning_pose.models.backbones.vits.build_backbone
-        #     model.backbone.pos_embed = torch.nn.Parameter(
-        #         torch.zeros(
-        #             1,
-        #             finetune_img_size // patch_size,
-        #             finetune_img_size // patch_size,
-        #             embed_dim,
-        #         )
-        #     )
-        # load weights
-        state_dict = torch.load(ckpt_file)["state_dict"]
-        # put weights into model
-        model.load_state_dict(state_dict, strict=False)
     else:
-        if semi_supervised:
-            model = ModelClass.load_from_checkpoint(
-                ckpt_file,
-                loss_factory=loss_factories["supervised"],
-                loss_factory_unsupervised=loss_factories["unsupervised"],
-                strict=False,
-            )
-        else:
-            model = ModelClass.load_from_checkpoint(
-                ckpt_file,
-                loss_factory=loss_factories["supervised"],
-                strict=False,
-            )
+        model = ModelClass.load_from_checkpoint(
+            fixed_ckpt_file,
+            loss_factory=loss_factories["supervised"],
+            strict=False,
+        )
+
+    # clean up temporary file if created
+    if keys_remapped:
+        import os
+        os.unlink(fixed_ckpt_file)
 
     if eval:
         model.eval()
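
Design note: Lightning's load_from_checkpoint takes a file path rather than an in-memory dict, so when keys are remapped the fixed checkpoint is written to a temporary .ckpt (delete=False) and removed with os.unlink once the model is loaded. A rough usage sketch, assuming the training config is available as cfg; the checkpoint path is illustrative, and the real signature may require further arguments (e.g. data_module or loss factories for semi-supervised models):

from lightning_pose.utils.predictions import load_model_from_checkpoint

model = load_model_from_checkpoint(
    cfg=cfg,                                # OmegaConf config used at training time
    ckpt_file="runs/old_model/model.ckpt",  # older checkpoint without 'head.' prefixes
    eval=True,
)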

lightning_pose/utils/scripts.py

Lines changed: 7 additions & 1 deletion
@@ -489,7 +489,13 @@ def get_model(
         if not ckpt.endswith(".ckpt"):
             import glob
             ckpt = glob.glob(os.path.join(ckpt, "**", "*.ckpt"), recursive=True)[0]
-        state_dict = torch.load(ckpt)["state_dict"]
+        # Try loading with default settings first, fallback to weights_only=False if needed
+        try:
+            state_dict = torch.load(ckpt)["state_dict"]
+        except Exception as e:
+            print(f"Warning: Failed to load checkpoint with default settings: {e}")
+            print("Attempting to load with weights_only=False...")
+            state_dict = torch.load(ckpt, weights_only=False)["state_dict"]
         # try loading all weights
         try:
             model.load_state_dict(state_dict, strict=False)

tests/conftest.py

Lines changed: 9 additions & 3 deletions
@@ -523,8 +523,14 @@ def video_dataloader(cfg, base_dataset, video_list) -> LitDaliWrapper:
 def trainer(cfg) -> pl.Trainer:
     """Create a basic pytorch lightning trainer for testing models."""
 
-    cfg.training.unfreezing_epoch = 1 # exercise unfreezing
-    callbacks = get_callbacks(cfg, early_stopping=False, lr_monitor=False, backbone_unfreeze=True, checkpointing=False)
+    cfg.training.unfreezing_epoch = 1  # exercise unfreezing
+    callbacks = get_callbacks(
+        cfg,
+        early_stopping=False,
+        lr_monitor=False,
+        backbone_unfreeze=True,
+        checkpointing=False,
+    )
 
     trainer = pl.Trainer(
         accelerator="gpu",
@@ -534,7 +540,7 @@ def trainer(cfg) -> pl.Trainer:
         check_val_every_n_epoch=1,
         log_every_n_steps=1,
         callbacks=callbacks,
-        enable_checkpointing = False,
+        enable_checkpointing=False,
         limit_train_batches=2,
         num_sanity_val_steps=0,
         logger=False,
tests/models/test_heatmap_tracker.py

Lines changed: 45 additions & 1 deletion
@@ -24,7 +24,7 @@ def test_supervised_heatmap(
     )
 
 
-def test_supervised_heatmap_vit_sam(
+def test_supervised_heatmap_vitb_sam(
     cfg,
     heatmap_data_module,
     video_dataloader,
@@ -46,6 +46,50 @@ def test_supervised_heatmap_vit_sam(
     )
 
 
+def test_supervised_heatmap_vitb_imagenet(
+    cfg,
+    heatmap_data_module,
+    video_dataloader,
+    trainer,
+    run_model_test,
+):
+    """Test the initialization and training of a supervised heatmap model."""
+
+    cfg_tmp = copy.deepcopy(cfg)
+    cfg_tmp.model.model_type = "heatmap"
+    cfg_tmp.model.backbone = "vitb_imagenet"
+    cfg_tmp.model.losses_to_use = []
+
+    run_model_test(
+        cfg=cfg_tmp,
+        data_module=heatmap_data_module,
+        video_dataloader=video_dataloader,
+        trainer=trainer,
+    )
+
+
+def test_supervised_heatmap_vits_dino(
+    cfg,
+    heatmap_data_module,
+    video_dataloader,
+    trainer,
+    run_model_test,
+):
+    """Test the initialization and training of a supervised heatmap model."""
+
+    cfg_tmp = copy.deepcopy(cfg)
+    cfg_tmp.model.model_type = "heatmap"
+    cfg_tmp.model.backbone = "vits_dino"
+    cfg_tmp.model.losses_to_use = []
+
+    run_model_test(
+        cfg=cfg_tmp,
+        data_module=heatmap_data_module,
+        video_dataloader=video_dataloader,
+        trainer=trainer,
+    )
+
+
 def test_supervised_multiview_heatmap(
     cfg_multiview,
     multiview_heatmap_data_module,

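The two new tests mirror the existing vitb_sam test but exercise the vitb_imagenet and vits_dino backbones. They can be selected with pytest's -k filter, e.g. pytest tests/models/test_heatmap_tracker.py -k "vitb_imagenet or vits_dino"; note that the trainer fixture requests a GPU accelerator.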