-
Notifications
You must be signed in to change notification settings - Fork 1k
Address review feedback on device tensor helpers (#20078) #20078
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -198,6 +198,15 @@ TensorPtr make_tensor_ptr( | |
| TensorPtr clone_tensor_ptr( | ||
| const executorch::aten::Tensor& tensor, | ||
| executorch::aten::ScalarType type) { | ||
| #ifndef USE_ATEN_LIB | ||
| ET_CHECK_MSG( | ||
| tensor.device_type() == runtime::etensor::DeviceType::CPU, | ||
| "clone_tensor_ptr only supports CPU tensors; use clone_tensor_ptr_to with a CPU target first."); | ||
| #else // USE_ATEN_LIB | ||
| ET_CHECK_MSG( | ||
| tensor.is_cpu(), | ||
| "clone_tensor_ptr only supports CPU tensors; move it to CPU first (e.g. tensor.to(torch::kCPU))."); | ||
| #endif // USE_ATEN_LIB | ||
|
Comment on lines
+201
to
+209
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Addressed in the current version — |
||
| std::vector<executorch::aten::SizesType> sizes( | ||
| tensor.sizes().begin(), tensor.sizes().end()); | ||
| std::vector<executorch::aten::DimOrderType> dim_order{ | ||
|
|
@@ -252,11 +261,11 @@ TensorPtr clone_tensor_ptr( | |
| } ctx; | ||
|
|
||
| ET_SWITCH_REALHBBF16_AND_UINT_TYPES( | ||
| tensor_type, ctx, "clone_tensor_ptr_from", CTYPE_FROM, [&] { | ||
| tensor_type, ctx, "clone_tensor_ptr_cast_from", CTYPE_FROM, [&] { | ||
| const CTYPE_FROM* tensor_data_ptr = | ||
| static_cast<const CTYPE_FROM*>(tensor_data); | ||
| ET_SWITCH_REALHBBF16_AND_UINT_TYPES( | ||
| type, ctx, "clone_tensor_ptr_to", CTYPE_TO, [&] { | ||
| type, ctx, "clone_tensor_ptr_cast_to", CTYPE_TO, [&] { | ||
| CTYPE_TO* data_ptr = reinterpret_cast<CTYPE_TO*>(data.data()); | ||
| std::transform( | ||
| tensor_data_ptr, | ||
|
|
@@ -285,98 +294,84 @@ runtime::Error resize_tensor_ptr( | |
| sizes.data(), sizes.size())); | ||
| } | ||
|
|
||
| // ---- Device tensor helpers ---- | ||
| // ---- Device tensor helper ---- | ||
| // | ||
| // These helpers rely on the ExecuTorch DeviceAllocator and the portable tensor | ||
| // This helper relies on the ExecuTorch DeviceAllocator and the portable tensor | ||
| // metadata APIs (dim_order, shape_dynamism, device), which have no equivalent | ||
| // in USE_ATEN_LIB builds, so they are compiled out there. | ||
| // in USE_ATEN_LIB builds, so it is compiled out there. | ||
|
|
||
| #ifndef USE_ATEN_LIB | ||
|
|
||
| TensorPtr clone_tensor_ptr_to_device( | ||
| const TensorPtr& cpu_tensor, | ||
| executorch::aten::Device device) { | ||
| TensorPtr clone_tensor_ptr_to( | ||
| const TensorPtr& tensor, | ||
| executorch::aten::Device target) { | ||
| const auto source = tensor->device(); | ||
| ET_CHECK_MSG( | ||
| cpu_tensor->device().is_cpu(), | ||
| "Source tensor must reside on CPU; got device type %d.", | ||
| static_cast<int>(cpu_tensor->device_type())); | ||
|
|
||
| !(source.is_cpu() && target.is_cpu()), | ||
| "clone_tensor_ptr_to does not copy CPU-to-CPU; use clone_tensor_ptr."); | ||
| ET_CHECK_MSG( | ||
| !device.is_cpu(), | ||
| "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies."); | ||
| source.is_cpu() || target.is_cpu(), | ||
| "Device-to-device copy is not supported; route through CPU."); | ||
|
|
||
| const auto nbytes = tensor->nbytes(); | ||
| const auto* src_data = tensor->const_data_ptr(); | ||
| ET_CHECK_MSG(src_data != nullptr, "Source tensor has no data."); | ||
|
|
||
| // Whichever end is not CPU provides the allocator. | ||
| const auto device = target.is_cpu() ? source : target; | ||
| auto* allocator = runtime::get_device_allocator(device.type()); | ||
| ET_CHECK_MSG( | ||
| allocator != nullptr, | ||
| "No device allocator registered for device type %d", | ||
| static_cast<int>(device.type())); | ||
|
|
||
| const auto nbytes = cpu_tensor->nbytes(); | ||
| const auto* cpu_data = cpu_tensor->const_data_ptr(); | ||
| ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data."); | ||
|
|
||
| auto result = allocator->allocate(nbytes, device.index()); | ||
| ET_CHECK_MSG(result.ok(), "Failed to allocate device memory."); | ||
| void* device_data = result.get(); | ||
|
|
||
| auto err = allocator->copy_host_to_device( | ||
| device_data, cpu_data, nbytes, device.index()); | ||
| ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed."); | ||
|
|
||
| std::vector<executorch::aten::SizesType> sizes( | ||
| cpu_tensor->sizes().begin(), cpu_tensor->sizes().end()); | ||
| tensor->sizes().begin(), tensor->sizes().end()); | ||
| std::vector<executorch::aten::DimOrderType> dim_order( | ||
| cpu_tensor->dim_order().begin(), cpu_tensor->dim_order().end()); | ||
| tensor->dim_order().begin(), tensor->dim_order().end()); | ||
| std::vector<executorch::aten::StridesType> strides( | ||
| cpu_tensor->strides().begin(), cpu_tensor->strides().end()); | ||
| tensor->strides().begin(), tensor->strides().end()); | ||
|
|
||
| if (target.is_cpu()) { | ||
| std::vector<uint8_t> cpu_data(nbytes); | ||
| auto err = allocator->copy_device_to_host( | ||
| cpu_data.data(), src_data, nbytes, source.index()); | ||
| ET_CHECK_MSG( | ||
| err == runtime::Error::Ok, | ||
| "Device-to-host copy failed: error %d", | ||
| static_cast<int>(err)); | ||
| return make_tensor_ptr( | ||
| std::move(sizes), | ||
| std::move(cpu_data), | ||
| std::move(dim_order), | ||
| std::move(strides), | ||
| tensor->scalar_type(), | ||
| tensor->shape_dynamism()); | ||
| } | ||
|
|
||
| auto result = allocator->allocate(nbytes, target.index()); | ||
| ET_CHECK_MSG( | ||
| result.ok(), | ||
| "Failed to allocate device memory: error %d", | ||
| static_cast<int>(result.error())); | ||
| void* device_data = result.get(); | ||
| auto err = allocator->copy_host_to_device( | ||
| device_data, src_data, nbytes, target.index()); | ||
| ET_CHECK_MSG( | ||
| err == runtime::Error::Ok, | ||
| "Host-to-device copy failed: error %d", | ||
| static_cast<int>(err)); | ||
| return make_tensor_ptr( | ||
| std::move(sizes), | ||
| device_data, | ||
| std::move(dim_order), | ||
| std::move(strides), | ||
| cpu_tensor->scalar_type(), | ||
| cpu_tensor->shape_dynamism(), | ||
| [allocator, device](void* ptr) { | ||
| allocator->deallocate(ptr, device.index()); | ||
| tensor->scalar_type(), | ||
| tensor->shape_dynamism(), | ||
| [allocator, target](void* ptr) { | ||
| allocator->deallocate(ptr, target.index()); | ||
| }, | ||
| device); | ||
| } | ||
|
|
||
| TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) { | ||
| const auto nbytes = device_tensor->nbytes(); | ||
| const auto* device_data = device_tensor->const_data_ptr(); | ||
| ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data."); | ||
|
|
||
| const auto device = device_tensor->device(); | ||
| ET_CHECK_MSG(!device.is_cpu(), "Source tensor is already on CPU."); | ||
|
|
||
| auto* allocator = runtime::get_device_allocator(device.type()); | ||
| ET_CHECK_MSG( | ||
| allocator != nullptr, | ||
| "No device allocator registered for device type %d", | ||
| static_cast<int>(device.type())); | ||
|
|
||
| std::vector<uint8_t> cpu_data(nbytes); | ||
|
|
||
| auto err = allocator->copy_device_to_host( | ||
| cpu_data.data(), device_data, nbytes, device.index()); | ||
| ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed."); | ||
|
|
||
| std::vector<executorch::aten::SizesType> sizes( | ||
| device_tensor->sizes().begin(), device_tensor->sizes().end()); | ||
| std::vector<executorch::aten::DimOrderType> dim_order( | ||
| device_tensor->dim_order().begin(), device_tensor->dim_order().end()); | ||
| std::vector<executorch::aten::StridesType> strides( | ||
| device_tensor->strides().begin(), device_tensor->strides().end()); | ||
|
|
||
| return make_tensor_ptr( | ||
| std::move(sizes), | ||
| std::move(cpu_data), | ||
| std::move(dim_order), | ||
| std::move(strides), | ||
| device_tensor->scalar_type(), | ||
| device_tensor->shape_dynamism()); | ||
| target); | ||
| } | ||
|
|
||
| #endif // USE_ATEN_LIB | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,7 +36,7 @@ using TensorPtr = std::shared_ptr<executorch::aten::Tensor>; | |
| * allocated or copied. The caller is responsible for ensuring `data` already | ||
| * lives on the requested device; construct the `executorch::aten::Device` from | ||
| * the runtime environment and pass it in. To copy CPU data to a device, use | ||
| * `clone_tensor_ptr_to_device` instead. | ||
| * `clone_tensor_ptr_to` instead. | ||
| * | ||
| * @param sizes A vector specifying the size of each dimension. | ||
| * @param data A pointer to the data buffer (CPU or device, see device). | ||
|
|
@@ -110,7 +110,7 @@ inline TensorPtr make_tensor_ptr( | |
| * vectors of one type and a different scalar type. | ||
| * | ||
| * The result is always a CPU tensor. To move it to a device, use | ||
| * `clone_tensor_ptr_to_device`. | ||
| * `clone_tensor_ptr_to`. | ||
| * | ||
| * @tparam T The C++ type of the tensor elements, deduced from the vector. | ||
| * @param sizes A vector specifying the size of each dimension. | ||
|
|
@@ -204,7 +204,7 @@ inline TensorPtr make_tensor_ptr( | |
| * vector's data type. | ||
| * | ||
| * The result is always a CPU tensor. To move it to a device, use | ||
| * `clone_tensor_ptr_to_device`. | ||
| * `clone_tensor_ptr_to`. | ||
| * | ||
| * @tparam T The C++ type of the tensor elements, deduced from the vector. | ||
| * @param data A vector containing the tensor's data. | ||
|
|
@@ -236,7 +236,7 @@ inline TensorPtr make_tensor_ptr( | |
| * from the initializer list's data type. | ||
| * | ||
| * The result is always a CPU tensor. To move it to a device, use | ||
| * `clone_tensor_ptr_to_device`. | ||
| * `clone_tensor_ptr_to`. | ||
| * | ||
| * @tparam T The C++ type of the tensor elements, deduced from the initializer | ||
| * list. | ||
|
|
@@ -278,7 +278,7 @@ inline TensorPtr make_tensor_ptr( | |
| * initializer list's elements. | ||
| * | ||
| * The result is always a CPU tensor. To move it to a device, use | ||
| * `clone_tensor_ptr_to_device`. | ||
| * `clone_tensor_ptr_to`. | ||
| * | ||
| * @tparam T The C++ type of the tensor elements, deduced from the initializer | ||
| * list. | ||
|
|
@@ -375,7 +375,7 @@ inline TensorPtr make_tensor_ptr( | |
| * is left empty so the core may infer it from the provided strides. | ||
| * | ||
| * This overload always aliases — it never copies. To copy a tensor's data to | ||
| * a device, use `clone_tensor_ptr_to_device`. | ||
| * a device, use `clone_tensor_ptr_to`. | ||
| * | ||
| * @param tensor The source tensor to alias. | ||
| * @param sizes Optional sizes override. | ||
|
|
@@ -426,18 +426,21 @@ inline TensorPtr make_tensor_ptr( | |
| tensor.scalar_type(), | ||
| #ifndef USE_ATEN_LIB | ||
| tensor.shape_dynamism(), | ||
| std::move(deleter), | ||
| executorch::aten::Device(tensor.device_type(), tensor.device_index())); | ||
| #else // USE_ATEN_LIB | ||
| executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, | ||
| std::move(deleter), | ||
| tensor.device()); | ||
| #endif // USE_ATEN_LIB | ||
|
Comment on lines
431
to
+435
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch — addressed in the latest version. The |
||
| std::move(deleter)); | ||
| } | ||
|
|
||
| /** | ||
| * Convenience overload identical to make_tensor_ptr(*tensor_ptr, ...). | ||
| * Keeps the original TensorPtr alive until the returned TensorPtr is destroyed. | ||
| * | ||
| * This overload always aliases — it never copies. To copy a tensor's data to | ||
| * a device, use `clone_tensor_ptr_to_device`. | ||
| * a device, use `clone_tensor_ptr_to`. | ||
| * | ||
| * @param tensor_ptr The source tensor pointer to alias. | ||
| * @param sizes Optional sizes override. | ||
|
|
@@ -527,38 +530,29 @@ runtime::Error resize_tensor_ptr( | |
| const std::vector<executorch::aten::SizesType>& sizes); | ||
|
|
||
| /** | ||
| * Clones a CPU TensorPtr to a device TensorPtr. | ||
| * | ||
| * Allocates memory on the specified device and copies the tensor data from | ||
| * host to device using the DeviceAllocator registered for the given device | ||
| * type. The returned TensorPtr owns the device memory and will free it via | ||
| * the allocator when destroyed. | ||
| * Clones a TensorPtr's data onto the given target device, allocating and | ||
| * copying as needed. | ||
| * | ||
| * Only available in the ExecuTorch portable build: cloning relies on the | ||
| * ExecuTorch DeviceAllocator, which has no equivalent in USE_ATEN_LIB builds. | ||
| * | ||
| * @param cpu_tensor The source CPU tensor whose data will be copied. | ||
| * @param device The target device (must not be CPU). | ||
| * @return A TensorPtr backed by device memory containing the copied data. | ||
| */ | ||
| #ifndef USE_ATEN_LIB | ||
| TensorPtr clone_tensor_ptr_to_device( | ||
| const TensorPtr& cpu_tensor, | ||
| executorch::aten::Device device); | ||
|
|
||
| /** | ||
| * Clones a device TensorPtr to a CPU TensorPtr. | ||
| * The transfer direction is inferred from the source and target device: | ||
| * host-to-device when `target` is an accelerator, and device-to-host when | ||
| * `target` is CPU. Copies use the DeviceAllocator registered for the | ||
| * accelerator side; a device-backed result owns its memory and frees it via | ||
| * that allocator when destroyed. | ||
| * | ||
| * Allocates host memory and copies the tensor data from device to host using | ||
| * the DeviceAllocator registered for the source tensor's device type. The | ||
| * device is determined from the source tensor's metadata. | ||
| * Source and target must differ in device domain: for a CPU-to-CPU copy use | ||
| * clone_tensor_ptr, and device-to-device transfers are not supported. | ||
| * | ||
| * Only available in the ExecuTorch portable build. | ||
| * Only available in the ExecuTorch portable build: it relies on the ExecuTorch | ||
| * DeviceAllocator, which has no equivalent in USE_ATEN_LIB builds. | ||
| * | ||
| * @param device_tensor The source device tensor whose data will be copied. | ||
| * @return A TensorPtr backed by CPU memory containing the copied data. | ||
| * @param tensor The source tensor whose data will be copied. | ||
| * @param target The destination device (CPU or an accelerator). | ||
| * @return A TensorPtr backed by `target` memory containing the copied data. | ||
| */ | ||
| TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor); | ||
| #ifndef USE_ATEN_LIB | ||
| TensorPtr clone_tensor_ptr_to( | ||
| const TensorPtr& tensor, | ||
| executorch::aten::Device target); | ||
| #endif // USE_ATEN_LIB | ||
|
|
||
| } // namespace extension | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch — addressed in the latest version. Added an
#else // USE_ATEN_LIBbranch that guards ontensor.is_cpu()with an ATen-appropriate message, so a CUDA/accelerator ATen tensor aborts cleanly instead of silently host-reading device memory.