Optimize safetensors loading with 1D contiguous reads and ICI resharding.

BlaziusMaximus · Orbax Authors · commit 3a9ba8fbc95a · 2026-03-31T13:25:52.000-07:00
PiperOrigin-RevId: 892406249
diff --git a/checkpoint/orbax/checkpoint/experimental/v1/_src/layout/safetensors_layout.py b/checkpoint/orbax/checkpoint/experimental/v1/_src/layout/safetensors_layout.py
@@ -185,6 +185,26 @@ async def _load_safetensors_on_device(
   """Loads tensors from a safetensors file into on-device JAX arrays."""
   header, data_start_offset = await _read_safetensors_header(path)
   restored_pytree = {}
+
+  num_hosts = jax.process_count()
+  host_id = jax.process_index()
+
+  # Build an initial mesh grouping all global devices by host
+  devices_by_host = []
+  for i in range(num_hosts):
+    devices_by_host.append([d for d in jax.devices() if d.process_index == i])
+
+  # Ensure uniform mesh shape (in case of uneven device counts, which is rare)
+  min_devices = min(len(d) for d in devices_by_host)
+  devices_by_host = [d[:min_devices] for d in devices_by_host]
+
+  initial_mesh = jax.sharding.Mesh(
+      np.array(devices_by_host), ("hosts", "devices")
+  )
+  flat_sharding = jax.sharding.NamedSharding(
+      initial_mesh, jax.sharding.PartitionSpec("hosts")
+  )
+
   async with async_path.open_file(path, mode="rb") as f:
     for tensor_name, abstract_leaf in abstract_pytree.items():
       if tensor_name not in header:
@@ -195,7 +215,6 @@ async def _load_safetensors_on_device(
       stored_shape, stored_dtype = _get_array_properties(header[tensor_name])
       st_data_offsets = header[tensor_name]["data_offsets"]
       sharding = abstract_leaf.sharding
-      target_shape = abstract_leaf.shape
       target_dtype = abstract_leaf.dtype
 
       if sharding is None:
@@ -211,33 +230,54 @@ async def _load_safetensors_on_device(
         restored_pytree[tensor_name] = jax.device_put(np_array)
         continue
 
-      device_indices_map = sharding.addressable_devices_indices_map(
-          target_shape
-      )
+      # We have a target sharding.
+      # Use 1D flat contiguous read + reshard logic for maximum IO throughput.
+      total_elements = int(np.prod(stored_shape)) if stored_shape else 1
 
-      device_map = []
-      for device in device_indices_map:
-        idx = device_indices_map[device]
-        resolved_idx = numpy_utils.resolve_slice(idx, stored_shape)
-        shard_shape = numpy_utils.slice_shape(resolved_idx)
-
-        shard_np = await _read_non_contiguous_slice(
-            f,
-            resolved_idx,
-            stored_shape,
-            stored_dtype,
-            st_data_offsets[0] + data_start_offset,
-        )
-        shard_np = shard_np.reshape(shard_shape)  # pytype: disable=attribute-error
+      # Round up so each host owns an equal-sized chunk; the tail is zero-padded
+      elements_per_host = (total_elements + num_hosts - 1) // num_hosts
+      padded_elements = elements_per_host * num_hosts
+
+      # Calculate what this host needs to read
+      start_idx = host_id * elements_per_host
+      end_idx = min((host_id + 1) * elements_per_host, total_elements)
+      num_elements_to_read = max(0, end_idx - start_idx)
+      itemsize = np.dtype(stored_dtype).itemsize
 
-        if shard_np.dtype != target_dtype:
-          shard_np = shard_np.astype(target_dtype)
+      start_byte = st_data_offsets[0] + data_start_offset + start_idx * itemsize
+      num_bytes = num_elements_to_read * itemsize
 
-        device_map.append(jax.device_put(shard_np, device))
+      await f.seek(start_byte)
+      raw_data = await f.read(num_bytes)
 
-      restored_pytree[tensor_name] = jax.make_array_from_single_device_arrays(
-          target_shape, sharding, device_map
+      local_data = np.frombuffer(raw_data, dtype=stored_dtype)
+      if local_data.dtype != target_dtype:
+        local_data = local_data.astype(target_dtype)
+
+      if num_elements_to_read < elements_per_host:
+        local_data = np.pad(
+            local_data, (0, elements_per_host - num_elements_to_read)
+        )
+
+      # Put local data on all addressable devices in the flat sharding
+      local_arrays = [
+          jax.device_put(local_data, d)
+          for d in flat_sharding.addressable_devices
+      ]
+
+      # Create the 1D sharded array
+      flat_array = jax.make_array_from_single_device_arrays(
+          (padded_elements,), flat_sharding, local_arrays
       )
+
+      # Slice off the padding and reshape
+      if padded_elements > total_elements:
+        flat_array = flat_array[:total_elements]
+
+      reshaped_array = flat_array.reshape(stored_shape)
+
+      # Reshard to the target sharding
+      restored_pytree[tensor_name] = jax.device_put(reshaped_array, sharding)
   return restored_pytree