
Commit 14b2c90

yashk2810 authored and Google-ML-Automation committed
Allow the GSPMDSharding constructor to take a device_list (xc.DeviceList) as input, in addition to Sequence[jax.Device]. This avoids the extremely slow tuple(devices) -> DeviceList conversion in the GSPMDSharding constructor.
PiperOrigin-RevId: 778627673
1 parent 28fd600 commit 14b2c90
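In practice, the change lets call sites hand an existing xc.DeviceList straight to the constructor instead of materializing a tuple of devices first. A minimal before/after sketch (assumes at least one JAX device is available; `replicated` is just an example HloSharding, not taken from this commit):

import jax
from jax._src.lib import xla_client as xc
from jax._src.sharding_impls import GSPMDSharding

devices = tuple(jax.devices())
replicated = xc.HloSharding.replicate()  # example HloSharding

# Old path: Sequence[jax.Device]; the constructor had to rebuild a
# DeviceList from the tuple on every call.
s_old = GSPMDSharding(devices, replicated)

# New path: pass a DeviceList directly and skip that conversion.
s_new = GSPMDSharding(xc.DeviceList(devices), replicated)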

File tree

8 files changed: +115 additions, -70 deletions

jax/_src/array.py

Lines changed: 1 addition & 1 deletion
@@ -826,7 +826,7 @@ def get_data(index: Index | None) -> ArrayImpl | np.ndarray:
       )
 
   if dll is not None:
-    devices = [Format(dll, SingleDeviceSharding(d)) for d in devices]
+    devices = [Format(dll, SingleDeviceSharding(d)) for d in devices]  # type: ignore
   # pxla.batched_device_put doesn't support Layout... Take the slow route
   arrays = api.device_put(per_device_values, devices)
   return ArrayImpl(aval, sharding, arrays, committed=True)

jax/_src/interpreters/pxla.py

Lines changed: 41 additions & 22 deletions
@@ -59,6 +59,7 @@
 from jax._src.interpreters import xla
 from jax._src.layout import Layout, AutoLayout, Format
 from jax._src.lib import xla_client as xc
+from jax._src.lib import jaxlib_extension_version
 from jax._src.lib.mlir import ir
 from jax._src.lib.mlir.dialects import hlo
 from jax._src.partition_spec import PartitionSpec
@@ -2085,9 +2086,12 @@ class AllArgsInfo(NamedTuple):
 def to_gspmd_sharding(s: JSharding, ndim: int) -> GSPMDSharding:
   if isinstance(s, GSPMDSharding):
     return s
-  return GSPMDSharding(s._device_assignment, s._to_xla_hlo_sharding(ndim),
-                       memory_kind=s.memory_kind,
-                       _device_list=getattr(s, '_internal_device_list', None))
+  if jaxlib_extension_version >= 360:
+    return GSPMDSharding(s._internal_device_list, s._to_xla_hlo_sharding(ndim),
+                         memory_kind=s.memory_kind)
+  else:
+    return GSPMDSharding(s._device_assignment, s._to_xla_hlo_sharding(ndim),
+                         memory_kind=s.memory_kind)
 
 
 def _discharge_refs_jaxpr(closed_jaxpr, in_shardings, in_layouts,
@@ -2477,7 +2481,7 @@ def get_pspec_from_executable(
 
 def get_out_shardings_from_executable(
     xla_executable,
-    device_assignment: Sequence[xc.Device],
+    device_list: xc.DeviceList,
     num_out_avals: int,
     num_ordered_effects: int,
 ) -> Sequence[sharding_impls.GSPMDSharding] | None:
@@ -2492,9 +2496,14 @@
 
   # When the device assignment only has 1 device, SPMD partitioner will not run.
   # Hence the op shardings will not be set on the `hlo_module`.
-  if len(device_assignment) == 1:
-    return [sharding_impls.GSPMDSharding.get_replicated(device_assignment, memory_kind=mk)
-            for mk in omk]
+  if len(device_list) == 1:
+    if jaxlib_extension_version >= 360:
+      return [sharding_impls.GSPMDSharding.get_replicated(device_list, memory_kind=mk)
+              for mk in omk]
+    else:
+      da = tuple(device_list)
+      return [sharding_impls.GSPMDSharding.get_replicated(da, memory_kind=mk)
+              for mk in omk]
 
   _, out_op_shardings = get_op_sharding_from_executable(xla_executable)
   if not out_op_shardings:
@@ -2518,19 +2527,27 @@
   assert len(out_op_shardings) == num_out_avals == len(omk), (
       len(out_op_shardings), num_out_avals, len(omk))
 
-  return [sharding_impls.GSPMDSharding(device_assignment, os, memory_kind=mk)
-          for os, mk in safe_zip(out_op_shardings, omk)]
+  if jaxlib_extension_version >= 360:
+    return [sharding_impls.GSPMDSharding(device_list, os, memory_kind=mk)
+            for os, mk in safe_zip(out_op_shardings, omk)]
+  else:
+    da = tuple(device_list)
+    return [sharding_impls.GSPMDSharding(da, os, memory_kind=mk)
+            for os, mk in safe_zip(out_op_shardings, omk)]
 
 
 def _get_in_shardings_from_xla(
-    xla_executable, device_assignment: Sequence[xc.Device], num_in_avals: int,
+    xla_executable, device_list: xc.DeviceList, num_in_avals: int,
     num_ordered_effects: int
 ) -> Sequence[GSPMDSharding] | None:
   """Returns input shardings from XLA."""
   # When the device assignment only has 1 device, SPMD partitioner will not run.
   # Hence the op shardings will not be set on the `hlo_module`.
-  if len(device_assignment) == 1:
-    return [GSPMDSharding.get_replicated(device_assignment)] * num_in_avals
+  if len(device_list) == 1:
+    if jaxlib_extension_version >= 360:
+      return [GSPMDSharding.get_replicated(device_list)] * num_in_avals
+    else:
+      return [GSPMDSharding.get_replicated(tuple(device_list))] * num_in_avals
 
   in_op_shardings, _ = get_op_sharding_from_executable(xla_executable)
   if not in_op_shardings:
@@ -2542,8 +2559,11 @@ def _get_in_shardings_from_xla(
   assert len(in_op_shardings) == num_in_avals, (
       len(in_op_shardings), num_in_avals)
 
-  return [GSPMDSharding(device_assignment, os)
-          for os in in_op_shardings]
+  if jaxlib_extension_version >= 360:
+    return [GSPMDSharding(device_list, os) for os in in_op_shardings]
+  else:
+    da = tuple(device_list)
+    return [GSPMDSharding(da, os) for os in in_op_shardings]
 
 
 # TODO(yashkatariya): Remove this function after `AUTO` can return shardings
@@ -2758,8 +2778,8 @@ def _cached_compilation(computation, name, mesh, spmd_lowering,
 
 
 def _maybe_get_and_check_in_shardings(
-    xla_executable, in_shardings, device_assignment,
-    global_in_avals, num_ordered_effects):
+    xla_executable, in_shardings, device_list, global_in_avals,
+    num_ordered_effects):
   """Returns in_shardings extracted from XLA or checks and returns original
   shardings.
 
@@ -2770,8 +2790,7 @@ def _maybe_get_and_check_in_shardings(
   If in_sharding is unspecified, then the sharding returned by XLA is returned.
   """
   in_shardings_xla = _get_in_shardings_from_xla(
-      xla_executable, device_assignment, len(global_in_avals),
-      num_ordered_effects)
+      xla_executable, device_list, len(global_in_avals), num_ordered_effects)
   if in_shardings_xla is None:
     return in_shardings
 
@@ -2802,11 +2821,11 @@
 
 
 def _maybe_get_and_check_out_shardings(
-    xla_executable, out_shardings, device_assignment, global_out_avals,
+    xla_executable, out_shardings, device_list, global_out_avals,
     num_ordered_effects
 ):
   out_shardings_xla = get_out_shardings_from_executable(
-      xla_executable, device_assignment, len(global_out_avals),
+      xla_executable, device_list, len(global_out_avals),
       num_ordered_effects)
   if out_shardings_xla is None:
     return out_shardings
@@ -2987,10 +3006,10 @@ def from_hlo(name: str,
   if pmap_nreps == 1:
     assert mesh is None
     in_shardings = _maybe_get_and_check_in_shardings(
-        xla_executable, in_shardings, tuple(device_list), global_in_avals,
+        xla_executable, in_shardings, device_list, global_in_avals,
        len(ordered_effects))
     out_shardings = _maybe_get_and_check_out_shardings(
-        xla_executable, out_shardings, tuple(device_list), global_out_avals,
+        xla_executable, out_shardings, device_list, global_out_avals,
        len(ordered_effects))
   else:
     in_shardings, out_shardings, committed, device_list = _get_metadata_jit_pmap(
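Every call site above gates on jaxlib_extension_version because an older installed jaxlib still exposes only the Sequence[Device] constructor. The recurring pattern, condensed into one hypothetical helper (the helper name is illustrative, not part of the diff):

from jax._src.lib import jaxlib_extension_version
from jax._src.lib import xla_client as xc
from jax._src.sharding_impls import GSPMDSharding

def gspmd_from_device_list(device_list: xc.DeviceList, hlo_sharding,
                           memory_kind=None):
  # Newer jaxlib (extension version >= 360) accepts a DeviceList directly;
  # older jaxlib needs the tuple(device_list) fallback.
  if jaxlib_extension_version >= 360:
    return GSPMDSharding(device_list, hlo_sharding, memory_kind=memory_kind)
  return GSPMDSharding(tuple(device_list), hlo_sharding,
                       memory_kind=memory_kind)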

jax/_src/pjit.py

Lines changed: 6 additions & 5 deletions
@@ -61,6 +61,7 @@
 from jax._src.lib.mlir.dialects import func as func_dialect
 from jax._src.lib import jax_jit
 from jax._src.lib import xla_client as xc
+from jax._src.lib import jaxlib_extension_version
 from jax._src.mesh import AbstractMesh
 from jax._src.sharding import Sharding
 from jax._src.sharding_impls import (
@@ -2151,8 +2152,7 @@ def _pjit_batcher(axis_data, vals_in,
 
 
 def _pjit_batcher_for_sharding(
-    s: Sharding | UnspecifiedValue,
-    dim: int | batching.RaggedAxis, spmd_axis_name: tuple[str, ...] | None,
+    s, dim: int | batching.RaggedAxis, spmd_axis_name: tuple[str, ...] | None,
     mesh, ndim: int):
   if isinstance(s, UnspecifiedValue):
     return s
@@ -2167,9 +2167,10 @@
     tad = list(new_op.tile_assignment_dimensions)
     tad.insert(dim, 1)  # type: ignore
     new_op.tile_assignment_dimensions = tad
-    new_gs = GSPMDSharding(
-        s._device_assignment, new_op,
-        _device_list=getattr(s, '_internal_device_list', None))
+    if jaxlib_extension_version >= 360:
+      new_gs = GSPMDSharding(s._internal_device_list, new_op)
+    else:
+      new_gs = GSPMDSharding(s._device_assignment, new_op)
     return pxla._get_out_sharding_from_orig_sharding([new_gs], [None], s, None)[0]
   else:
     if isinstance(s, NamedSharding) and isinstance(s.mesh, AbstractMesh):
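The batching logic above replicates a sharding along the new batch dimension by inserting a size-1 entry into the tile assignment before rebuilding the GSPMDSharding. The same proto manipulation as a standalone sketch (hypothetical helper mirroring the lines above; a tile count of 1 on an axis means no partitioning there):

from jax._src.lib import xla_client as xc

def insert_replicated_batch_dim(hlo: xc.HloSharding, dim: int) -> xc.HloSharding:
  # Clone the proto so the original sharding is untouched.
  proto = hlo.to_proto().clone()
  tad = list(proto.tile_assignment_dimensions)
  tad.insert(dim, 1)  # size-1 tile => replicated along the new axis
  proto.tile_assignment_dimensions = tad
  return xc.HloSharding.from_proto(proto)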

jax/_src/sharding.py

Lines changed: 4 additions & 0 deletions
@@ -126,6 +126,10 @@ def with_memory_kind(self, kind: str) -> Sharding:
   def _device_assignment(self) -> XLADeviceAssignment:
     raise NotImplementedError('Subclasses should implement this method.')
 
+  @property
+  def _internal_device_list(self) -> xc.DeviceList:
+    raise NotImplementedError('Subclasses should implement this method.')
+
   def _to_xla_hlo_sharding(self, num_dimensions: int) -> xc.HloSharding:
     raise NotImplementedError('Subclasses should implement this method.')
 
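With this base-class addition, Sharding subclasses are expected to expose an _internal_device_list alongside _device_assignment. A sketch of how a concrete subclass might satisfy both (hypothetical class, not part of this commit):

import functools
from jax._src import sharding as jsharding
from jax._src.lib import xla_client as xc

class MySharding(jsharding.Sharding):  # hypothetical subclass
  def __init__(self, devices):
    self._devs = tuple(devices)

  @property
  def _device_assignment(self):
    return self._devs

  @functools.cached_property
  def _internal_device_list(self) -> xc.DeviceList:
    # Cache the DeviceList so the tuple -> DeviceList conversion happens
    # at most once per sharding instance.
    return xc.DeviceList(self._devs)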

jax/_src/sharding_impls.py

Lines changed: 25 additions & 19 deletions
@@ -33,6 +33,7 @@
 from jax._src import xla_bridge as xb
 from jax._src import mesh_utils
 from jax._src.lib import xla_client as xc
+from jax._src.lib import jaxlib_extension_version
 from jax._src.lib.mlir.dialects import sdy
 from jax._src.named_sharding import ( # noqa: F401
     SdyArray, SdyDim, UnspecifiedValue, AUTO,
@@ -360,22 +361,20 @@ def _unpickle_gspmd_sharding(devices, op_sharding, memory_kind):
 
 @use_cpp_class(xc.GSPMDSharding)
 class GSPMDSharding(jsharding.Sharding):
-  _devices: tuple[Device, ...]
+  _devices: xc.DeviceList
   _hlo_sharding: xc.HloSharding
   _memory_kind: str | None
-  _device_list: xc.DeviceList | None
   _internal_device_list: xc.DeviceList
 
   @use_cpp_method()
-  def __init__(self, devices: Sequence[Device],
+  def __init__(self, devices: Sequence[Device] | xc.DeviceList,
                op_sharding: xc.OpSharding | xc.HloSharding,
-               *, memory_kind: str | None = None,
-               _device_list: xc.DeviceList | None = None):
-    self._devices = tuple(devices)
-    if isinstance(op_sharding, xc.OpSharding):
-      self._hlo_sharding = xc.HloSharding.from_proto(op_sharding)
-    else:
-      self._hlo_sharding = op_sharding
+               *, memory_kind: str | None = None):
+    self._devices = (devices if isinstance(devices, xc.DeviceList) else
+                     xc.DeviceList(tuple(devices)))
+    self._hlo_sharding = (xc.HloSharding.from_proto(op_sharding)
+                          if isinstance(op_sharding, xc.OpSharding) else
+                          op_sharding)
     self._memory_kind = memory_kind
 
   def __reduce__(self):
@@ -417,7 +416,7 @@ def check_compatible_aval(self, aval_shape: Shape) -> None:
 
   @property
   def num_devices(self) -> int:
-    return len(self.device_set)
+    return len(self._internal_device_list)
 
   @functools.cached_property
   def device_set(self) -> set[Device]:
@@ -432,7 +431,7 @@ def with_memory_kind(self, kind: str) -> GSPMDSharding:
 
   @property
   def _device_assignment(self) -> XLADeviceAssignment:
-    return self._devices
+    return tuple(self._devices)
 
   def _to_xla_hlo_sharding(self, num_dimensions: int) -> xc.HloSharding:
     return self._hlo_sharding
@@ -468,7 +467,7 @@ def is_fully_addressable(self) -> bool:
 
   @classmethod
   def get_replicated(cls, device_assignment, *, memory_kind: str | None = None):
-    return cls(tuple(device_assignment), replicated_hlo_sharding,
+    return cls(device_assignment, replicated_hlo_sharding,
                memory_kind=memory_kind)
 
 
@@ -982,12 +981,15 @@ def make_key_array_phys_sharding(aval, sharding):
     return sharding.update(spec=PartitionSpec(*sharding.spec, *trailing_spec))
   else:
     hlos = sharding._to_xla_hlo_sharding(aval.ndim)
-    return GSPMDSharding(
-        sharding._device_assignment, physical_hlo_sharding(aval, hlos))
+    if jaxlib_extension_version >= 360:
+      return GSPMDSharding(
+          sharding._internal_device_list, physical_hlo_sharding(aval, hlos))
+    else:
+      return GSPMDSharding(
+          sharding._device_assignment, physical_hlo_sharding(aval, hlos))
 
 
-def physical_sharding(
-    aval, sharding: jsharding.Sharding) -> jsharding.Sharding:
+def physical_sharding(aval, sharding: jsharding.Sharding) -> jsharding.Sharding:
   return make_key_array_phys_sharding(aval, sharding)
 
 
@@ -1001,8 +1003,12 @@ def get_logical_gspmd_sharding(logical_shape, dtype, phys_sharding):
   logical_op_sharding = phys_hlo_sharding.to_proto().clone()
   tad = partitions[:-elt_aval.ndim] + suffix
   logical_op_sharding.tile_assignment_dimensions = tad
-  return GSPMDSharding(phys_sharding._device_assignment,
-                       xc.HloSharding.from_proto(logical_op_sharding))
+  if jaxlib_extension_version >= 360:
+    return GSPMDSharding(phys_sharding._internal_device_list,
+                         xc.HloSharding.from_proto(logical_op_sharding))
+  else:
+    return GSPMDSharding(phys_sharding._device_assignment,
+                         xc.HloSharding.from_proto(logical_op_sharding))
 
 def check_replicated_trailing_dims(sharding: jsharding.Sharding,
                                    logical_shape, dtype):
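After these changes the Python-level constructor accepts either spelling and normalizes to a DeviceList once, while _device_assignment still returns a tuple for backward compatibility. A usage sketch (assumes at least one JAX device):

import jax
from jax._src.lib import xla_client as xc
from jax._src.sharding_impls import GSPMDSharding

devices = tuple(jax.devices())
hlo = xc.HloSharding.replicate()

a = GSPMDSharding(devices, hlo)                 # Sequence[Device]
b = GSPMDSharding(xc.DeviceList(devices), hlo)  # xc.DeviceList

# Both spellings describe the same device assignment.
assert a._device_assignment == b._device_assignment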

jaxlib/sharding.cc

Lines changed: 18 additions & 16 deletions
@@ -271,22 +271,18 @@ PmapSharding::PmapSharding(xla::nb_numpy_ndarray devices,
   type_ = nanobind::type<PmapSharding>().inc_ref().ptr();
 }
 
-GSPMDSharding::GSPMDSharding(nb::sequence devices, xla::HloSharding op_sharding,
-                             nb::object memory_kind, nb::object device_list)
+GSPMDSharding::GSPMDSharding(xla::nb_class_ptr<PyDeviceList> devices,
+                             xla::HloSharding op_sharding,
+                             nb::object memory_kind)
     : Sharding(/*num_devices=*/nb::len(devices.ptr())),
-      devices_(nb::tuple(devices)),
+      devices_(std::move(devices)),
       hlo_sharding_(std::move(op_sharding)),
       memory_kind_(std::move(memory_kind)) {
-  if (device_list.is_none()) {
-    internal_device_list_ = xla::make_nb_class<PyDeviceList>(devices_);
-  } else {
-    internal_device_list_ =
-        nb::cast<xla::nb_class_ptr<jax::PyDeviceList>>(std::move(device_list));
-  }
+  internal_device_list_ = devices_;
   // This checks in python if the memory kind is correct for the given
   // devices. Currently in python this check is optimized but we want to
   // move that check to C++ after which we can remove this call.
-  CHECK(devices_.size() != 0)
+  CHECK(devices_->Len() != 0)
       << "Devices given to GSPMDSharding must not be empty";
   memory_kind_ =
       CheckAndCanonicalizeMemoryKind(memory_kind_, internal_device_list_);
@@ -346,14 +342,20 @@ void RegisterSharding(nb::module_& m) {
   PmapSharding::InitializeType();
 
   nb::class_<GSPMDSharding, Sharding>(m, "GSPMDSharding", nb::dynamic_attr())
-      .def(nb::init<nb::sequence, xla::OpSharding, nb::object, nb::object>(),
+      .def(nb::init<nb::sequence, xla::OpSharding, nb::object>(),
           nb::arg("devices"), nb::arg("op_sharding"),
-           nb::arg("memory_kind").none() = nb::none(),
-           nb::arg("_device_list").none() = nb::none())
-      .def(nb::init<nb::sequence, xla::HloSharding, nb::object, nb::object>(),
+           nb::arg("memory_kind").none() = nb::none())
+      .def(nb::init<nb::sequence, xla::HloSharding, nb::object>(),
           nb::arg("devices"), nb::arg("op_sharding"),
-           nb::arg("memory_kind").none() = nb::none(),
-           nb::arg("_device_list").none() = nb::none())
+           nb::arg("memory_kind").none() = nb::none())
+      .def(nb::init<xla::nb_class_ptr<PyDeviceList>, xla::OpSharding,
+                    nb::object>(),
+           nb::arg("devices"), nb::arg("op_sharding"),
+           nb::arg("memory_kind").none() = nb::none())
+      .def(nb::init<xla::nb_class_ptr<PyDeviceList>, xla::HloSharding,
+                    nb::object>(),
+           nb::arg("devices"), nb::arg("op_sharding"),
+           nb::arg("memory_kind").none() = nb::none())
       .def_prop_ro("_devices", &GSPMDSharding::devices)
       .def_prop_ro("_hlo_sharding", &GSPMDSharding::hlo_sharding)
       .def_prop_ro("_memory_kind", &GSPMDSharding::memory_kind)

jaxlib/sharding.h

Lines changed: 19 additions & 6 deletions
@@ -164,16 +164,29 @@ class PmapSharding : public Sharding {
 class GSPMDSharding : public Sharding {
  public:
  GSPMDSharding(nanobind::sequence devices, xla::OpSharding op_sharding,
-                nanobind::object memory_kind, nanobind::object device_list)
+                nanobind::object memory_kind)
      : GSPMDSharding(
-            std::move(devices),
+            xla::make_nb_class<PyDeviceList>(nanobind::tuple(devices)),
            xla::ValueOrThrow(xla::HloSharding::FromProto(op_sharding)),
-            std::move(memory_kind), std::move(device_list)) {}
+            std::move(memory_kind)) {}
 
  GSPMDSharding(nanobind::sequence devices, xla::HloSharding op_sharding,
-                nanobind::object memory_kind, nanobind::object device_list);
+                nanobind::object memory_kind)
+      : GSPMDSharding(
+            xla::make_nb_class<PyDeviceList>(nanobind::tuple(devices)),
+            std::move(op_sharding), std::move(memory_kind)) {}
+
+  GSPMDSharding(xla::nb_class_ptr<PyDeviceList> devices,
+                xla::OpSharding op_sharding, nanobind::object memory_kind)
+      : GSPMDSharding(
+            std::move(devices),
+            xla::ValueOrThrow(xla::HloSharding::FromProto(op_sharding)),
+            std::move(memory_kind)) {}
+
+  GSPMDSharding(xla::nb_class_ptr<PyDeviceList> devices,
+                xla::HloSharding op_sharding, nanobind::object memory_kind);
 
-  const nanobind::tuple& devices() const { return devices_; }
+  xla::nb_class_ptr<PyDeviceList> devices() const { return devices_; }
  const nanobind::object& memory_kind() const { return memory_kind_; }
 
  size_t Hash() {
@@ -226,7 +239,7 @@
    return hlo_sharding().IsReplicated();
  }
 
-  nanobind::tuple devices_;
+  xla::nb_class_ptr<PyDeviceList> devices_;
  xla::HloSharding hlo_sharding_;
  nanobind::object memory_kind_;
  std::optional<size_t> hash_;
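Together with sharding.cc, the header now declares four constructor overloads: the two legacy nb::sequence forms plus two PyDeviceList forms, covering both OpSharding and HloSharding payloads, so existing callers keep working while new callers pass a DeviceList. From Python both spellings remain valid (sketch; which C++ overload nanobind selects for a given argument is an implementation detail):

import jax
from jax._src.lib import xla_client as xc

devs = tuple(jax.devices())
hlo = xc.HloSharding.replicate()

s1 = xc.GSPMDSharding(devs, hlo)                 # legacy sequence form
s2 = xc.GSPMDSharding(xc.DeviceList(devs), hlo)  # DeviceList form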
