@@ -4,34 +4,36 @@
 import copy
 import functools
 import itertools
-import unittest
 from typing import Any, Optional, Union
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed.fsdp import fully_shard
 from torch.nn.parallel.scatter_gather import _is_namedtuple
-from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     check_sharded_parity,
     DoubleLinear,
     FSDPTest,
     FSDPTestMultiThread,
+    get_devtype,
     MLP,
 )
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     ModelArgs,
     Transformer,
 )
 
 
+device_type = torch.device(get_devtype())
+
+
 class TestFullyShardAutograd(FSDPTest):
     @property
     def world_size(self) -> int:
-        return min(4, torch.cuda.device_count())
+        return min(4, torch.get_device_module(device_type).device_count())
 
     def _reduce_1d_partial_grads(
         self, module: nn.Module, group: Optional[dist.ProcessGroup] = None
@@ -58,7 +60,7 @@ def _test_unused_forward_output(self, reshard_after_forward: Union[bool, int]):
         local_batch_size = 2
         global_batch_size, dim = (self.world_size * local_batch_size, 24)
         model = DoubleLinear(dim=dim, use_second_linear=True)
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).to(device_type)
         fully_shard(model.lin1, reshard_after_forward=reshard_after_forward)
         fully_shard(model, reshard_after_forward=reshard_after_forward)
         ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
@@ -68,7 +70,7 @@ def _test_unused_forward_output(self, reshard_after_forward: Union[bool, int]):
         for iter_idx in range(10):
             # Use all forward outputs in the loss/backward for the first half
             # of the iterations and only the 1st forward output for the rest
-            global_inp = torch.rand((global_batch_size, dim), device="cuda")
+            global_inp = torch.rand((global_batch_size, dim), device=device_type)
             local_inp = global_inp[
                 self.rank * local_batch_size : (self.rank + 1) * local_batch_size
             ].detach()
@@ -104,7 +106,7 @@ def _test_unused_forward_module(self, reshard_after_forward: Union[bool, int]):
         local_batch_size, dim = (2, 24)
         global_batch_size = self.world_size * local_batch_size
         model = DoubleLinear(dim=dim, use_second_linear=False)
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).to(device_type)
         fully_shard(model.lin1, reshard_after_forward=reshard_after_forward)
         fully_shard(model.lin2, reshard_after_forward=reshard_after_forward)
         fully_shard(model, reshard_after_forward=reshard_after_forward)
@@ -113,7 +115,7 @@ def _test_unused_forward_module(self, reshard_after_forward: Union[bool, int]):
 
         torch.manual_seed(1)  # same on all ranks
         for iter_idx in range(10):
-            global_inp = torch.rand((global_batch_size, dim), device="cuda")
+            global_inp = torch.rand((global_batch_size, dim), device=device_type)
             local_inp = global_inp[
                 self.rank * local_batch_size : (self.rank + 1) * local_batch_size
             ].detach()
@@ -214,7 +216,7 @@ def forward(self, x: torch.Tensor):
             Module(dim),
             FromContainerType(container_type),
         )
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).to(device_type)
         for module in model:
             fully_shard(module)
         fully_shard(model)
@@ -223,7 +225,7 @@ def forward(self, x: torch.Tensor):
 
         torch.manual_seed(1)  # same on all ranks
         for iter_idx in range(10):
-            global_inp = torch.rand((global_batch_size, dim), device="cuda")
+            global_inp = torch.rand((global_batch_size, dim), device=device_type)
             local_inp = global_inp[
                 self.rank * local_batch_size : (self.rank + 1) * local_batch_size
             ].detach()
@@ -245,7 +247,7 @@ class TestFullyShardPostAccGradHookMultiThread(FSDPTestMultiThread):
     def world_size(self) -> int:
         return 2
 
-    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @skip_if_lt_x_gpu(1)
     def test_post_acc_grad_hook_runs(self):
         param_name_to_hook_count = collections.defaultdict(int)
 
@@ -260,7 +262,7 @@ def hook(param_name: str, param: torch.Tensor) -> None:
             param_hook = functools.partial(hook, param_name)
             param.register_post_accumulate_grad_hook(param_hook)
 
-        inp = torch.randn((2, 8), device="cuda")
+        inp = torch.randn((2, 8), device=device_type)
         model(inp).sum().backward()
         param_names = {param_name for param_name, _ in model.named_parameters()}
         self.assertEqual(param_names, set(param_name_to_hook_count.keys()))
@@ -271,7 +273,7 @@ def hook(param_name: str, param: torch.Tensor) -> None:
 class TestFullyShardPostAccGradHookMultiProcess(FSDPTest):
     @property
     def world_size(self) -> int:
-        return min(torch.cuda.device_count(), 2)
+        return min(torch.get_device_module(device_type).device_count(), 2)
 
     @skip_if_lt_x_gpu(2)
     def test_post_acc_grad_hook_optim_parity(self):
@@ -283,7 +285,7 @@ def test_post_acc_grad_hook_optim_parity(self):
         model_args = ModelArgs(dropout_p=0.0)
         model = Transformer(model_args)
 
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).to(device_type)
         for module in itertools.chain(ref_model.layers, [ref_model]):
             fully_shard(module)
         optim_kwargs = {"lr": 1e-2, "foreach": False}
@@ -312,7 +314,7 @@ def optim_hook(param: nn.Parameter) -> None:
             param.register_post_accumulate_grad_hook(optim_hook)
 
         torch.manual_seed(42 + self.rank)
-        inp = torch.randint(0, model_args.vocab_size, (2, 16), device="cuda")
+        inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type)
         for _ in range(10):
             ref_loss = ref_model(inp).sum()
             ref_loss.backward()
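
Note on the pattern above: the change makes these tests device-agnostic by resolving the test device once at module scope instead of hard-coding "cuda" at every call site. A minimal standalone sketch of the idea, assuming get_devtype() returns a device type string such as "cuda" or "cpu" (the real helper is imported from torch.testing._internal.common_fsdp; the version below is a hypothetical stand-in):

import torch

# Hypothetical stand-in for common_fsdp.get_devtype(): prefer the CUDA
# backend when available, otherwise fall back to CPU.
def get_devtype() -> str:
    return "cuda" if torch.cuda.is_available() else "cpu"

device_type = torch.device(get_devtype())

# torch.get_device_module(device) returns the backend module for the given
# device type (e.g., torch.cuda for "cuda"), so device_count() no longer
# needs to hard-code torch.cuda.device_count().
num_devices = torch.get_device_module(device_type).device_count()

# Tensors and modules then target the resolved device uniformly:
inp = torch.rand((2, 8), device=device_type)   # replaces device="cuda"
model = torch.nn.Linear(8, 8).to(device_type)  # replaces .cuda()

Since .to() and the device= keyword both accept a torch.device, every former "cuda" literal can be routed through the single device_type constant, which is why the diff only touches device-resolution sites and leaves the test logic unchanged.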