facebookresearch · provos · Nov 28, 2025 · Nov 28, 2025 · Nov 28, 2025 · Nov 28, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
 ]
 dependencies = [
     "timm>=1.0.17",
-    "numpy==1.26",
+    "numpy>=1.26",
     "tqdm",
     "ftfy==6.1.1",
     "regex",
@@ -59,7 +59,7 @@ notebooks = [
     "ipycanvas",
     "ipympl",
     "pycocotools",
-    "decord",
+    "decord2",
     "opencv-python",
     "einops",
     "scikit-image",

diff --git a/sam3/eval/postprocessors.py b/sam3/eval/postprocessors.py
@@ -150,9 +150,13 @@ def _process_masks(self, target_sizes, pred_masks, consistent=True, keep=None):
         if pred_masks is None:
             return None
         if self.always_interpolate_masks_on_gpu:
-            gpu_device = target_sizes.device
-            assert gpu_device.type == "cuda"
-            pred_masks = pred_masks.to(device=gpu_device)
+            device = target_sizes.device
+            if device.type == "cpu":
+                logging.warning(
+                    "always_interpolate_masks_on_gpu=True but data is on CPU; "
+                    "falling back to CPU interpolation"
+                )
+            pred_masks = pred_masks.to(device=device)
         if consistent:
             assert keep is None, "TODO: implement?"
             # All masks should have the same shape, expected when processing a batch of size 1
@@ -454,9 +458,13 @@ def process_results(
             ]  # [P,Q,...] --> [K,...]
             meta_td = meta_td[tracked_obj_ids_idx[PROMPT_AXIS].cpu()]
             if self.always_interpolate_masks_on_gpu:
-                gpu_device = meta_td["original_size"].device
-                assert gpu_device.type == "cuda"
-                tracked_objs_outs_td = tracked_objs_outs_td.to(device=gpu_device)
+                device = meta_td["original_size"].device
+                if device.type == "cpu":
+                    logging.warning(
+                        "always_interpolate_masks_on_gpu=True but data is on CPU; "
+                        "falling back to CPU interpolation"
+                    )
+                tracked_objs_outs_td = tracked_objs_outs_td.to(device=device)
             frame_results_td = self(
                 tracked_objs_outs_td.unsqueeze(1),
                 (

diff --git a/sam3/model/decoder.py b/sam3/model/decoder.py
@@ -277,8 +277,9 @@ def __init__(
 
             if resolution is not None and stride is not None:
                 feat_size = resolution // stride
+                device = "cuda" if torch.cuda.is_available() else "cpu"
                 coords_h, coords_w = self._get_coords(
-                    feat_size, feat_size, device="cuda"
+                    feat_size, feat_size, device=device
                 )
                 self.compilable_cord_cache = (coords_h, coords_w)
                 self.compilable_stored_size = (feat_size, feat_size)

diff --git a/sam3/model/geometry_encoders.py b/sam3/model/geometry_encoders.py
@@ -10,7 +10,7 @@
 from .act_ckpt_utils import activation_ckpt_wrapper
 from .box_ops import box_cxcywh_to_xyxy
 
-from .model_misc import get_clones
+from .model_misc import get_clones, tensor_to_device
 
 
 def is_right_padded(mask):
@@ -656,7 +656,7 @@ def _encode_boxes(self, boxes, boxes_mask, boxes_labels, img_feats):
             # We need to denormalize, and convert to [x, y, x, y]
             boxes_xyxy = box_cxcywh_to_xyxy(boxes)
             scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype)
-            scale = scale.pin_memory().to(device=boxes_xyxy.device, non_blocking=True)
+            scale = tensor_to_device(scale, boxes_xyxy.device)
             scale = scale.view(1, 1, 4)
             boxes_xyxy = boxes_xyxy * scale
             sampled = torchvision.ops.roi_align(