pytorch · shoumikhin · Jun 8, 2026 · Jun 8, 2026 · shoumikhin · Jun 6, 2026
@@ -199,6 +199,22 @@ auto tensor = clone_tensor_ptr(original_tensor);
 
 Note that, regardless of whether the original `TensorPtr` owns the data or not, the newly created `TensorPtr` will own a copy of the data.
 
+#### Cloning To or From a Device
+
+If a tensor lives on CPU and you want a copy on an accelerator, or the other way around, use `clone_tensor_ptr_to` with the device you want. It allocates memory on the target device, copies the data for you, and the returned `TensorPtr` owns that memory.
+
+```cpp
+auto cpu_tensor = make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+
+// CPU to device:
+auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA);
+
+// Device back to CPU:
+auto host_tensor = clone_tensor_ptr_to(device_tensor, DeviceType::CPU);
+```
+
+The direction is chosen from the source and target device. This needs a `DeviceAllocator` registered for the device, so it is available only in the portable (non-`USE_ATEN_LIB`) build. For a plain CPU-to-CPU copy, use `clone_tensor_ptr` instead.
+
 ### Resizing Tensors
 
 The `TensorShapeDynamism` enum specifies the mutability of a tensor's shape:
@@ -375,6 +391,7 @@ Here's a table matching `TensorPtr` creation functions with their corresponding
 | `at::tensor(data, type)`                    | `make_tensor_ptr(data, type)`               |
 | `at::tensor(data, type).reshape(sizes)`     | `make_tensor_ptr(sizes, data, type)`        |
 | `tensor.clone()`                            | `clone_tensor_ptr(tensor)`                  |
+| `tensor.to(device)`                         | `clone_tensor_ptr_to(tensor, device)`       |
 | `tensor.resize_(new_sizes)`                 | `resize_tensor_ptr(tensor, new_sizes)`      |
 | `at::scalar_tensor(value)`                  | `scalar_tensor(value)`                      |
 | `at::from_blob(data, sizes, type)`          | `from_blob(data, sizes, type)`              |

@@ -24,11 +24,11 @@ def define_common_targets():
             ],
             visibility = ["PUBLIC"],
             deps = [
-                "//executorch/runtime/core:device_allocator",
                 "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
             ],
             exported_deps = [
+                "//executorch/runtime/core:device_allocator",
                 "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix,
             ],

@@ -198,6 +198,15 @@ TensorPtr make_tensor_ptr(
 TensorPtr clone_tensor_ptr(
     const executorch::aten::Tensor& tensor,
     executorch::aten::ScalarType type) {
+#ifndef USE_ATEN_LIB
+  ET_CHECK_MSG(
+      tensor.device_type() == runtime::etensor::DeviceType::CPU,
+      "clone_tensor_ptr only supports CPU tensors; use clone_tensor_ptr_to with a CPU target first.");
+#else // USE_ATEN_LIB
+  ET_CHECK_MSG(
+      tensor.is_cpu(),
+      "clone_tensor_ptr only supports CPU tensors; move it to CPU first (e.g. tensor.to(torch::kCPU)).");
+#endif // USE_ATEN_LIB
   std::vector<executorch::aten::SizesType> sizes(
       tensor.sizes().begin(), tensor.sizes().end());
   std::vector<executorch::aten::DimOrderType> dim_order{
@@ -252,11 +261,11 @@ TensorPtr clone_tensor_ptr(
   } ctx;
 
   ET_SWITCH_REALHBBF16_AND_UINT_TYPES(
-      tensor_type, ctx, "clone_tensor_ptr_from", CTYPE_FROM, [&] {
+      tensor_type, ctx, "clone_tensor_ptr_cast_from", CTYPE_FROM, [&] {
         const CTYPE_FROM* tensor_data_ptr =
             static_cast<const CTYPE_FROM*>(tensor_data);
         ET_SWITCH_REALHBBF16_AND_UINT_TYPES(
-            type, ctx, "clone_tensor_ptr_to", CTYPE_TO, [&] {
+            type, ctx, "clone_tensor_ptr_cast_to", CTYPE_TO, [&] {
               CTYPE_TO* data_ptr = reinterpret_cast<CTYPE_TO*>(data.data());
               std::transform(
                   tensor_data_ptr,
@@ -285,98 +294,84 @@ runtime::Error resize_tensor_ptr(
           sizes.data(), sizes.size()));
 }
 
-// ---- Device tensor helpers ----
+// ---- Device tensor helper ----
 //
-// These helpers rely on the ExecuTorch DeviceAllocator and the portable tensor
+// This helper relies on the ExecuTorch DeviceAllocator and the portable tensor
 // metadata APIs (dim_order, shape_dynamism, device), which have no equivalent
-// in USE_ATEN_LIB builds, so they are compiled out there.
+// in USE_ATEN_LIB builds, so it is compiled out there.
 
 #ifndef USE_ATEN_LIB
 
-TensorPtr clone_tensor_ptr_to_device(
-    const TensorPtr& cpu_tensor,
-    executorch::aten::Device device) {
+TensorPtr clone_tensor_ptr_to(
+    const TensorPtr& tensor,
+    executorch::aten::Device target) {
+  const auto source = tensor->device();
   ET_CHECK_MSG(
-      cpu_tensor->device().is_cpu(),
-      "Source tensor must reside on CPU; got device type %d.",
-      static_cast<int>(cpu_tensor->device_type()));
-
+      !(source.is_cpu() && target.is_cpu()),
+      "clone_tensor_ptr_to does not copy CPU-to-CPU; use clone_tensor_ptr.");
   ET_CHECK_MSG(
-      !device.is_cpu(),
-      "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies.");
+      source.is_cpu() || target.is_cpu(),
+      "Device-to-device copy is not supported; route through CPU.");
 
+  const auto nbytes = tensor->nbytes();
+  const auto* src_data = tensor->const_data_ptr();
+  ET_CHECK_MSG(src_data != nullptr, "Source tensor has no data.");
+
+  // Whichever end is not CPU provides the allocator.
+  const auto device = target.is_cpu() ? source : target;
   auto* allocator = runtime::get_device_allocator(device.type());
   ET_CHECK_MSG(
       allocator != nullptr,
       "No device allocator registered for device type %d",
       static_cast<int>(device.type()));
 
-  const auto nbytes = cpu_tensor->nbytes();
-  const auto* cpu_data = cpu_tensor->const_data_ptr();
-  ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data.");
-
-  auto result = allocator->allocate(nbytes, device.index());
-  ET_CHECK_MSG(result.ok(), "Failed to allocate device memory.");
-  void* device_data = result.get();
-
-  auto err = allocator->copy_host_to_device(
-      device_data, cpu_data, nbytes, device.index());
-  ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed.");
-
   std::vector<executorch::aten::SizesType> sizes(
-      cpu_tensor->sizes().begin(), cpu_tensor->sizes().end());
+      tensor->sizes().begin(), tensor->sizes().end());
   std::vector<executorch::aten::DimOrderType> dim_order(
-      cpu_tensor->dim_order().begin(), cpu_tensor->dim_order().end());
+      tensor->dim_order().begin(), tensor->dim_order().end());
   std::vector<executorch::aten::StridesType> strides(
-      cpu_tensor->strides().begin(), cpu_tensor->strides().end());
+      tensor->strides().begin(), tensor->strides().end());
+
+  if (target.is_cpu()) {
+    std::vector<uint8_t> cpu_data(nbytes);
+    auto err = allocator->copy_device_to_host(
+        cpu_data.data(), src_data, nbytes, source.index());
+    ET_CHECK_MSG(
+        err == runtime::Error::Ok,
+        "Device-to-host copy failed: error %d",
+        static_cast<int>(err));
+    return make_tensor_ptr(
+        std::move(sizes),
+        std::move(cpu_data),
+        std::move(dim_order),
+        std::move(strides),
+        tensor->scalar_type(),
+        tensor->shape_dynamism());
+  }
 
+  auto result = allocator->allocate(nbytes, target.index());
+  ET_CHECK_MSG(
+      result.ok(),
+      "Failed to allocate device memory: error %d",
+      static_cast<int>(result.error()));
+  void* device_data = result.get();
+  auto err = allocator->copy_host_to_device(
+      device_data, src_data, nbytes, target.index());
+  ET_CHECK_MSG(
+      err == runtime::Error::Ok,
+      "Host-to-device copy failed: error %d",
+      static_cast<int>(err));
   return make_tensor_ptr(
       std::move(sizes),
       device_data,
       std::move(dim_order),
       std::move(strides),
-      cpu_tensor->scalar_type(),
-      cpu_tensor->shape_dynamism(),
-      [allocator, device](void* ptr) {
-        allocator->deallocate(ptr, device.index());
+      tensor->scalar_type(),
+      tensor->shape_dynamism(),
+      [allocator, target](void* ptr) {
+        allocator->deallocate(ptr, target.index());
       },
-      device);
-}
-
-TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) {
-  const auto nbytes = device_tensor->nbytes();
-  const auto* device_data = device_tensor->const_data_ptr();
-  ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data.");
-
-  const auto device = device_tensor->device();
-  ET_CHECK_MSG(!device.is_cpu(), "Source tensor is already on CPU.");
-
-  auto* allocator = runtime::get_device_allocator(device.type());
-  ET_CHECK_MSG(
-      allocator != nullptr,
-      "No device allocator registered for device type %d",
-      static_cast<int>(device.type()));
-
-  std::vector<uint8_t> cpu_data(nbytes);
-
-  auto err = allocator->copy_device_to_host(
-      cpu_data.data(), device_data, nbytes, device.index());
-  ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed.");
-
-  std::vector<executorch::aten::SizesType> sizes(
-      device_tensor->sizes().begin(), device_tensor->sizes().end());
-  std::vector<executorch::aten::DimOrderType> dim_order(
-      device_tensor->dim_order().begin(), device_tensor->dim_order().end());
-  std::vector<executorch::aten::StridesType> strides(
-      device_tensor->strides().begin(), device_tensor->strides().end());
-
-  return make_tensor_ptr(
-      std::move(sizes),
-      std::move(cpu_data),
-      std::move(dim_order),
-      std::move(strides),
-      device_tensor->scalar_type(),
-      device_tensor->shape_dynamism());
+      target);
 }
 
 #endif // USE_ATEN_LIB

@@ -36,7 +36,7 @@ using TensorPtr = std::shared_ptr<executorch::aten::Tensor>;
  * allocated or copied. The caller is responsible for ensuring `data` already
  * lives on the requested device; construct the `executorch::aten::Device` from
  * the runtime environment and pass it in. To copy CPU data to a device, use
- * `clone_tensor_ptr_to_device` instead.
+ * `clone_tensor_ptr_to` instead.
  *
  * @param sizes A vector specifying the size of each dimension.
  * @param data A pointer to the data buffer (CPU or device, see device).
@@ -110,7 +110,7 @@ inline TensorPtr make_tensor_ptr(
  * vectors of one type and a different scalar type.
  *
  * The result is always a CPU tensor. To move it to a device, use
- * `clone_tensor_ptr_to_device`.
+ * `clone_tensor_ptr_to`.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the vector.
  * @param sizes A vector specifying the size of each dimension.
@@ -204,7 +204,7 @@ inline TensorPtr make_tensor_ptr(
  * vector's data type.
  *
  * The result is always a CPU tensor. To move it to a device, use
- * `clone_tensor_ptr_to_device`.
+ * `clone_tensor_ptr_to`.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the vector.
  * @param data A vector containing the tensor's data.
@@ -236,7 +236,7 @@ inline TensorPtr make_tensor_ptr(
  * from the initializer list's data type.
  *
  * The result is always a CPU tensor. To move it to a device, use
- * `clone_tensor_ptr_to_device`.
+ * `clone_tensor_ptr_to`.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the initializer
  * list.
@@ -278,7 +278,7 @@ inline TensorPtr make_tensor_ptr(
  * initializer list's elements.
  *
  * The result is always a CPU tensor. To move it to a device, use
- * `clone_tensor_ptr_to_device`.
+ * `clone_tensor_ptr_to`.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the initializer
  * list.
@@ -375,7 +375,7 @@ inline TensorPtr make_tensor_ptr(
  * is left empty so the core may infer it from the provided strides.
  *
  * This overload always aliases — it never copies. To copy a tensor's data to
- * a device, use `clone_tensor_ptr_to_device`.
+ * a device, use `clone_tensor_ptr_to`.
  *
  * @param tensor The source tensor to alias.
  * @param sizes Optional sizes override.
@@ -426,18 +426,21 @@ inline TensorPtr make_tensor_ptr(
       tensor.scalar_type(),
 #ifndef USE_ATEN_LIB
       tensor.shape_dynamism(),
+      std::move(deleter),
+      executorch::aten::Device(tensor.device_type(), tensor.device_index()));
 #else // USE_ATEN_LIB
       executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+      std::move(deleter),
+      tensor.device());
 #endif // USE_ATEN_LIB
-      std::move(deleter));
 }
 
 /**
  * Convenience overload identical to make_tensor_ptr(*tensor_ptr, ...).
  * Keeps the original TensorPtr alive until the returned TensorPtr is destroyed.
  *
  * This overload always aliases — it never copies. To copy a tensor's data to
- * a device, use `clone_tensor_ptr_to_device`.
+ * a device, use `clone_tensor_ptr_to`.
  *
  * @param tensor_ptr The source tensor pointer to alias.
  * @param sizes Optional sizes override.
@@ -527,38 +530,29 @@ runtime::Error resize_tensor_ptr(
     const std::vector<executorch::aten::SizesType>& sizes);
 
 /**
- * Clones a CPU TensorPtr to a device TensorPtr.
- *
- * Allocates memory on the specified device and copies the tensor data from
- * host to device using the DeviceAllocator registered for the given device
- * type. The returned TensorPtr owns the device memory and will free it via
- * the allocator when destroyed.
+ * Clones a TensorPtr's data onto the given target device, allocating and
+ * copying as needed.
  *
- * Only available in the ExecuTorch portable build: cloning relies on the
- * ExecuTorch DeviceAllocator, which has no equivalent in USE_ATEN_LIB builds.
- *
- * @param cpu_tensor The source CPU tensor whose data will be copied.
- * @param device The target device (must not be CPU).
- * @return A TensorPtr backed by device memory containing the copied data.
- */
-#ifndef USE_ATEN_LIB
-TensorPtr clone_tensor_ptr_to_device(
-    const TensorPtr& cpu_tensor,
-    executorch::aten::Device device);
-
-/**
- * Clones a device TensorPtr to a CPU TensorPtr.
+ * The transfer direction is inferred from the source and target device:
+ * host-to-device when `target` is an accelerator, and device-to-host when
+ * `target` is CPU. Copies use the DeviceAllocator registered for the
+ * accelerator side; a device-backed result owns its memory and frees it via
+ * that allocator when destroyed.
  *
- * Allocates host memory and copies the tensor data from device to host using
- * the DeviceAllocator registered for the source tensor's device type. The
- * device is determined from the source tensor's metadata.
+ * Source and target must differ in device domain: for a CPU-to-CPU copy use
+ * clone_tensor_ptr, and device-to-device transfers are not supported.
  *
- * Only available in the ExecuTorch portable build.
+ * Only available in the ExecuTorch portable build: it relies on the ExecuTorch
+ * DeviceAllocator, which has no equivalent in USE_ATEN_LIB builds.
  *
- * @param device_tensor The source device tensor whose data will be copied.
- * @return A TensorPtr backed by CPU memory containing the copied data.
+ * @param tensor The source tensor whose data will be copied.
+ * @param target The destination device (CPU or an accelerator).
+ * @return A TensorPtr backed by `target` memory containing the copied data.
  */
-TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor);
+#ifndef USE_ATEN_LIB
+TensorPtr clone_tensor_ptr_to(
+    const TensorPtr& tensor,
+    executorch::aten::Device target);
 #endif // USE_ATEN_LIB
 
 } // namespace extension

@@ -19,7 +19,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
-set(_test_srcs tensor_ptr_maker_test.cpp tensor_ptr_test.cpp)
+set(_test_srcs tensor_ptr_maker_test.cpp tensor_ptr_test.cpp
+               tensor_ptr_device_test.cpp
+)
 
 et_cxx_test(
   extension_tensor_test SOURCES ${_test_srcs} EXTRA_LIBS extension_tensor