diff --git a/docs/source/extension-tensor.md b/docs/source/extension-tensor.md index 910c06053ed..81b8a617adc 100644 --- a/docs/source/extension-tensor.md +++ b/docs/source/extension-tensor.md @@ -199,6 +199,22 @@ auto tensor = clone_tensor_ptr(original_tensor); Note that, regardless of whether the original `TensorPtr` owns the data or not, the newly created `TensorPtr` will own a copy of the data. +#### Cloning To or From a Device + +If a tensor lives on CPU and you want a copy on an accelerator, or the other way around, use `clone_tensor_ptr_to` with the device you want. It allocates memory on the target device, copies the data for you, and the returned `TensorPtr` owns that memory. + +```cpp +auto cpu_tensor = make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + +// CPU to device: +auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + +// Device back to CPU: +auto host_tensor = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); +``` + +The direction is chosen from the source and target device. This needs a `DeviceAllocator` registered for the device, so it is available only in the portable (non-`USE_ATEN_LIB`) build. For a plain CPU-to-CPU copy, use `clone_tensor_ptr` instead. + ### Resizing Tensors The `TensorShapeDynamism` enum specifies the mutability of a tensor's shape: @@ -375,6 +391,7 @@ Here's a table matching `TensorPtr` creation functions with their corresponding | `at::tensor(data, type)` | `make_tensor_ptr(data, type)` | | `at::tensor(data, type).reshape(sizes)` | `make_tensor_ptr(sizes, data, type)` | | `tensor.clone()` | `clone_tensor_ptr(tensor)` | +| `tensor.to(device)` | `clone_tensor_ptr_to(tensor, device)` | | `tensor.resize_(new_sizes)` | `resize_tensor_ptr(tensor, new_sizes)` | | `at::scalar_tensor(value)` | `scalar_tensor(value)` | | `at::from_blob(data, sizes, type)` | `from_blob(data, sizes, type)` | diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index 6a5c40f9857..5fcac79534b 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -24,11 +24,11 @@ def define_common_targets(): ], visibility = ["PUBLIC"], deps = [ - "//executorch/runtime/core:device_allocator", "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, ], exported_deps = [ + "//executorch/runtime/core:device_allocator", "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, ], diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index 006365d92d0..fb01c57541c 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -198,6 +198,15 @@ TensorPtr make_tensor_ptr( TensorPtr clone_tensor_ptr( const executorch::aten::Tensor& tensor, executorch::aten::ScalarType type) { +#ifndef USE_ATEN_LIB + ET_CHECK_MSG( + tensor.device_type() == runtime::etensor::DeviceType::CPU, + "clone_tensor_ptr only supports CPU tensors; use clone_tensor_ptr_to with a CPU target first."); +#else // USE_ATEN_LIB + ET_CHECK_MSG( + tensor.is_cpu(), + "clone_tensor_ptr only supports CPU tensors; move it to CPU first (e.g. tensor.to(torch::kCPU))."); +#endif // USE_ATEN_LIB std::vector sizes( tensor.sizes().begin(), tensor.sizes().end()); std::vector dim_order{ @@ -252,11 +261,11 @@ TensorPtr clone_tensor_ptr( } ctx; ET_SWITCH_REALHBBF16_AND_UINT_TYPES( - tensor_type, ctx, "clone_tensor_ptr_from", CTYPE_FROM, [&] { + tensor_type, ctx, "clone_tensor_ptr_cast_from", CTYPE_FROM, [&] { const CTYPE_FROM* tensor_data_ptr = static_cast(tensor_data); ET_SWITCH_REALHBBF16_AND_UINT_TYPES( - type, ctx, "clone_tensor_ptr_to", CTYPE_TO, [&] { + type, ctx, "clone_tensor_ptr_cast_to", CTYPE_TO, [&] { CTYPE_TO* data_ptr = reinterpret_cast(data.data()); std::transform( tensor_data_ptr, @@ -285,98 +294,84 @@ runtime::Error resize_tensor_ptr( sizes.data(), sizes.size())); } -// ---- Device tensor helpers ---- +// ---- Device tensor helper ---- // -// These helpers rely on the ExecuTorch DeviceAllocator and the portable tensor +// This helper relies on the ExecuTorch DeviceAllocator and the portable tensor // metadata APIs (dim_order, shape_dynamism, device), which have no equivalent -// in USE_ATEN_LIB builds, so they are compiled out there. +// in USE_ATEN_LIB builds, so it is compiled out there. #ifndef USE_ATEN_LIB -TensorPtr clone_tensor_ptr_to_device( - const TensorPtr& cpu_tensor, - executorch::aten::Device device) { +TensorPtr clone_tensor_ptr_to( + const TensorPtr& tensor, + executorch::aten::Device target) { + const auto source = tensor->device(); ET_CHECK_MSG( - cpu_tensor->device().is_cpu(), - "Source tensor must reside on CPU; got device type %d.", - static_cast(cpu_tensor->device_type())); - + !(source.is_cpu() && target.is_cpu()), + "clone_tensor_ptr_to does not copy CPU-to-CPU; use clone_tensor_ptr."); ET_CHECK_MSG( - !device.is_cpu(), - "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies."); + source.is_cpu() || target.is_cpu(), + "Device-to-device copy is not supported; route through CPU."); + const auto nbytes = tensor->nbytes(); + const auto* src_data = tensor->const_data_ptr(); + ET_CHECK_MSG(src_data != nullptr, "Source tensor has no data."); + + // Whichever end is not CPU provides the allocator. + const auto device = target.is_cpu() ? source : target; auto* allocator = runtime::get_device_allocator(device.type()); ET_CHECK_MSG( allocator != nullptr, "No device allocator registered for device type %d", static_cast(device.type())); - const auto nbytes = cpu_tensor->nbytes(); - const auto* cpu_data = cpu_tensor->const_data_ptr(); - ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data."); - - auto result = allocator->allocate(nbytes, device.index()); - ET_CHECK_MSG(result.ok(), "Failed to allocate device memory."); - void* device_data = result.get(); - - auto err = allocator->copy_host_to_device( - device_data, cpu_data, nbytes, device.index()); - ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed."); - std::vector sizes( - cpu_tensor->sizes().begin(), cpu_tensor->sizes().end()); + tensor->sizes().begin(), tensor->sizes().end()); std::vector dim_order( - cpu_tensor->dim_order().begin(), cpu_tensor->dim_order().end()); + tensor->dim_order().begin(), tensor->dim_order().end()); std::vector strides( - cpu_tensor->strides().begin(), cpu_tensor->strides().end()); + tensor->strides().begin(), tensor->strides().end()); + + if (target.is_cpu()) { + std::vector cpu_data(nbytes); + auto err = allocator->copy_device_to_host( + cpu_data.data(), src_data, nbytes, source.index()); + ET_CHECK_MSG( + err == runtime::Error::Ok, + "Device-to-host copy failed: error %d", + static_cast(err)); + return make_tensor_ptr( + std::move(sizes), + std::move(cpu_data), + std::move(dim_order), + std::move(strides), + tensor->scalar_type(), + tensor->shape_dynamism()); + } + auto result = allocator->allocate(nbytes, target.index()); + ET_CHECK_MSG( + result.ok(), + "Failed to allocate device memory: error %d", + static_cast(result.error())); + void* device_data = result.get(); + auto err = allocator->copy_host_to_device( + device_data, src_data, nbytes, target.index()); + ET_CHECK_MSG( + err == runtime::Error::Ok, + "Host-to-device copy failed: error %d", + static_cast(err)); return make_tensor_ptr( std::move(sizes), device_data, std::move(dim_order), std::move(strides), - cpu_tensor->scalar_type(), - cpu_tensor->shape_dynamism(), - [allocator, device](void* ptr) { - allocator->deallocate(ptr, device.index()); + tensor->scalar_type(), + tensor->shape_dynamism(), + [allocator, target](void* ptr) { + allocator->deallocate(ptr, target.index()); }, - device); -} - -TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) { - const auto nbytes = device_tensor->nbytes(); - const auto* device_data = device_tensor->const_data_ptr(); - ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data."); - - const auto device = device_tensor->device(); - ET_CHECK_MSG(!device.is_cpu(), "Source tensor is already on CPU."); - - auto* allocator = runtime::get_device_allocator(device.type()); - ET_CHECK_MSG( - allocator != nullptr, - "No device allocator registered for device type %d", - static_cast(device.type())); - - std::vector cpu_data(nbytes); - - auto err = allocator->copy_device_to_host( - cpu_data.data(), device_data, nbytes, device.index()); - ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed."); - - std::vector sizes( - device_tensor->sizes().begin(), device_tensor->sizes().end()); - std::vector dim_order( - device_tensor->dim_order().begin(), device_tensor->dim_order().end()); - std::vector strides( - device_tensor->strides().begin(), device_tensor->strides().end()); - - return make_tensor_ptr( - std::move(sizes), - std::move(cpu_data), - std::move(dim_order), - std::move(strides), - device_tensor->scalar_type(), - device_tensor->shape_dynamism()); + target); } #endif // USE_ATEN_LIB diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h index f9a89a05f30..ffe13cb5c3d 100644 --- a/extension/tensor/tensor_ptr.h +++ b/extension/tensor/tensor_ptr.h @@ -36,7 +36,7 @@ using TensorPtr = std::shared_ptr; * allocated or copied. The caller is responsible for ensuring `data` already * lives on the requested device; construct the `executorch::aten::Device` from * the runtime environment and pass it in. To copy CPU data to a device, use - * `clone_tensor_ptr_to_device` instead. + * `clone_tensor_ptr_to` instead. * * @param sizes A vector specifying the size of each dimension. * @param data A pointer to the data buffer (CPU or device, see device). @@ -110,7 +110,7 @@ inline TensorPtr make_tensor_ptr( * vectors of one type and a different scalar type. * * The result is always a CPU tensor. To move it to a device, use - * `clone_tensor_ptr_to_device`. + * `clone_tensor_ptr_to`. * * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param sizes A vector specifying the size of each dimension. @@ -204,7 +204,7 @@ inline TensorPtr make_tensor_ptr( * vector's data type. * * The result is always a CPU tensor. To move it to a device, use - * `clone_tensor_ptr_to_device`. + * `clone_tensor_ptr_to`. * * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param data A vector containing the tensor's data. @@ -236,7 +236,7 @@ inline TensorPtr make_tensor_ptr( * from the initializer list's data type. * * The result is always a CPU tensor. To move it to a device, use - * `clone_tensor_ptr_to_device`. + * `clone_tensor_ptr_to`. * * @tparam T The C++ type of the tensor elements, deduced from the initializer * list. @@ -278,7 +278,7 @@ inline TensorPtr make_tensor_ptr( * initializer list's elements. * * The result is always a CPU tensor. To move it to a device, use - * `clone_tensor_ptr_to_device`. + * `clone_tensor_ptr_to`. * * @tparam T The C++ type of the tensor elements, deduced from the initializer * list. @@ -375,7 +375,7 @@ inline TensorPtr make_tensor_ptr( * is left empty so the core may infer it from the provided strides. * * This overload always aliases — it never copies. To copy a tensor's data to - * a device, use `clone_tensor_ptr_to_device`. + * a device, use `clone_tensor_ptr_to`. * * @param tensor The source tensor to alias. * @param sizes Optional sizes override. @@ -426,10 +426,13 @@ inline TensorPtr make_tensor_ptr( tensor.scalar_type(), #ifndef USE_ATEN_LIB tensor.shape_dynamism(), + std::move(deleter), + executorch::aten::Device(tensor.device_type(), tensor.device_index())); #else // USE_ATEN_LIB executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + std::move(deleter), + tensor.device()); #endif // USE_ATEN_LIB - std::move(deleter)); } /** @@ -437,7 +440,7 @@ inline TensorPtr make_tensor_ptr( * Keeps the original TensorPtr alive until the returned TensorPtr is destroyed. * * This overload always aliases — it never copies. To copy a tensor's data to - * a device, use `clone_tensor_ptr_to_device`. + * a device, use `clone_tensor_ptr_to`. * * @param tensor_ptr The source tensor pointer to alias. * @param sizes Optional sizes override. @@ -527,38 +530,29 @@ runtime::Error resize_tensor_ptr( const std::vector& sizes); /** - * Clones a CPU TensorPtr to a device TensorPtr. - * - * Allocates memory on the specified device and copies the tensor data from - * host to device using the DeviceAllocator registered for the given device - * type. The returned TensorPtr owns the device memory and will free it via - * the allocator when destroyed. + * Clones a TensorPtr's data onto the given target device, allocating and + * copying as needed. * - * Only available in the ExecuTorch portable build: cloning relies on the - * ExecuTorch DeviceAllocator, which has no equivalent in USE_ATEN_LIB builds. - * - * @param cpu_tensor The source CPU tensor whose data will be copied. - * @param device The target device (must not be CPU). - * @return A TensorPtr backed by device memory containing the copied data. - */ -#ifndef USE_ATEN_LIB -TensorPtr clone_tensor_ptr_to_device( - const TensorPtr& cpu_tensor, - executorch::aten::Device device); - -/** - * Clones a device TensorPtr to a CPU TensorPtr. + * The transfer direction is inferred from the source and target device: + * host-to-device when `target` is an accelerator, and device-to-host when + * `target` is CPU. Copies use the DeviceAllocator registered for the + * accelerator side; a device-backed result owns its memory and frees it via + * that allocator when destroyed. * - * Allocates host memory and copies the tensor data from device to host using - * the DeviceAllocator registered for the source tensor's device type. The - * device is determined from the source tensor's metadata. + * Source and target must differ in device domain: for a CPU-to-CPU copy use + * clone_tensor_ptr, and device-to-device transfers are not supported. * - * Only available in the ExecuTorch portable build. + * Only available in the ExecuTorch portable build: it relies on the ExecuTorch + * DeviceAllocator, which has no equivalent in USE_ATEN_LIB builds. * - * @param device_tensor The source device tensor whose data will be copied. - * @return A TensorPtr backed by CPU memory containing the copied data. + * @param tensor The source tensor whose data will be copied. + * @param target The destination device (CPU or an accelerator). + * @return A TensorPtr backed by `target` memory containing the copied data. */ -TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor); +#ifndef USE_ATEN_LIB +TensorPtr clone_tensor_ptr_to( + const TensorPtr& tensor, + executorch::aten::Device target); #endif // USE_ATEN_LIB } // namespace extension diff --git a/extension/tensor/test/CMakeLists.txt b/extension/tensor/test/CMakeLists.txt index 0e5fd1d97ef..4512c3405d4 100644 --- a/extension/tensor/test/CMakeLists.txt +++ b/extension/tensor/test/CMakeLists.txt @@ -19,7 +19,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -set(_test_srcs tensor_ptr_maker_test.cpp tensor_ptr_test.cpp) +set(_test_srcs tensor_ptr_maker_test.cpp tensor_ptr_test.cpp + tensor_ptr_device_test.cpp +) et_cxx_test( extension_tensor_test SOURCES ${_test_srcs} EXTRA_LIBS extension_tensor diff --git a/extension/tensor/test/tensor_ptr_device_test.cpp b/extension/tensor/test/tensor_ptr_device_test.cpp index aedd34a6cf1..d8e573ed394 100644 --- a/extension/tensor/test/tensor_ptr_device_test.cpp +++ b/extension/tensor/test/tensor_ptr_device_test.cpp @@ -57,7 +57,7 @@ class TensorPtrDeviceTest : public ::testing::Test { TEST_F(TensorPtrDeviceTest, CpuToDeviceTensor) { auto cpu_tensor = make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); EXPECT_EQ(device_tensor->dim(), 2); EXPECT_EQ(device_tensor->size(0), 2); @@ -77,7 +77,7 @@ TEST_F(TensorPtrDeviceTest, CpuToDeviceTensor) { TEST_F(TensorPtrDeviceTest, CpuToDeviceFromRawData) { constexpr std::array data{10.0f, 20.0f, 30.0f, 40.0f}; auto cpu_tensor = make_tensor_ptr({2, 2}, const_cast(data.data())); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); EXPECT_EQ(device_tensor->dim(), 2); EXPECT_EQ(device_tensor->size(0), 2); @@ -94,13 +94,13 @@ TEST_F(TensorPtrDeviceTest, CpuToDeviceFromRawData) { EXPECT_EQ(g_mock_cuda.h2d_count_, 1); } -// clone_tensor_ptr_to_cpu relies on TensorImpl device metadata which is only -// available in the non-ATen (ExecuTorch portable) path. +// Device-to-host clone needs TensorImpl device metadata, available only in the +// non-ATen (ExecuTorch portable) path. TEST_F(TensorPtrDeviceTest, DeviceToCpuTensor) { auto cpu_tensor = make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto result_tensor = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto result_tensor = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_EQ(result_tensor->dim(), 2); EXPECT_EQ(result_tensor->size(0), 2); @@ -124,8 +124,8 @@ TEST_F(TensorPtrDeviceTest, DeviceToCpuPreservesShapeDynamism) { {}, executorch::aten::ScalarType::Float, executorch::aten::TensorShapeDynamism::STATIC); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto result_tensor = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto result_tensor = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_EQ( result_tensor->shape_dynamism(), @@ -136,8 +136,8 @@ TEST_F(TensorPtrDeviceTest, RoundtripCpuDeviceCpu) { const std::vector original = {1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f}; auto cpu_tensor = make_tensor_ptr({2, 3}, original); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto roundtrip_tensor = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto roundtrip_tensor = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_NE(roundtrip_tensor->const_data_ptr(), cpu_tensor->const_data_ptr()); EXPECT_NE( @@ -157,8 +157,8 @@ TEST_F(TensorPtrDeviceTest, RoundtripCpuDeviceCpu) { TEST_F(TensorPtrDeviceTest, RoundtripInt32) { auto cpu_tensor = make_tensor_ptr({4}, std::vector{10, 20, 30, 40}); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Int); const std::vector expected = {10, 20, 30, 40}; @@ -170,12 +170,12 @@ TEST_F(TensorPtrDeviceTest, RoundtripInt32) { TEST_F(TensorPtrDeviceTest, DeviceIndexPropagation) { auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); - auto device_tensor = clone_tensor_ptr_to_device( - cpu_tensor, Device(DeviceType::CUDA, /*index=*/1)); + auto device_tensor = + clone_tensor_ptr_to(cpu_tensor, Device(DeviceType::CUDA, /*index=*/1)); EXPECT_EQ(device_tensor->unsafeGetTensorImpl()->device_index(), 1); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[0], 1.0f); EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[1], 2.0f); } @@ -183,8 +183,7 @@ TEST_F(TensorPtrDeviceTest, DeviceIndexPropagation) { TEST_F(TensorPtrDeviceTest, DeviceMemoryCleanup) { { auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); - auto device_tensor = - clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); EXPECT_EQ(g_mock_cuda.allocate_count_, 1); EXPECT_EQ(g_mock_cuda.deallocate_count_, 0); } @@ -193,12 +192,12 @@ TEST_F(TensorPtrDeviceTest, DeviceMemoryCleanup) { TEST_F(TensorPtrDeviceTest, ScalarTensorRoundtrip) { auto cpu_tensor = make_tensor_ptr({}, {42.0f}); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); EXPECT_EQ(device_tensor->dim(), 0); EXPECT_EQ(device_tensor->numel(), 1); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_EQ(roundtrip->dim(), 0); EXPECT_EQ(roundtrip->numel(), 1); EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[0], 42.0f); @@ -207,8 +206,8 @@ TEST_F(TensorPtrDeviceTest, ScalarTensorRoundtrip) { TEST_F(TensorPtrDeviceTest, RawDataRoundtrip) { constexpr std::array raw_data{100.0f, 200.0f, 300.0f}; auto cpu_tensor = make_tensor_ptr({3}, const_cast(raw_data.data())); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_EQ(roundtrip->dim(), 1); EXPECT_EQ(roundtrip->size(0), 3); @@ -218,26 +217,32 @@ TEST_F(TensorPtrDeviceTest, RawDataRoundtrip) { EXPECT_FLOAT_EQ(data[2], 300.0f); } -TEST_F(TensorPtrDeviceTest, ErrorCpuTargetDevice) { +TEST_F(TensorPtrDeviceTest, ErrorCpuToCpu) { auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); - ET_EXPECT_DEATH(clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CPU), ""); + ET_EXPECT_DEATH( + clone_tensor_ptr_to(cpu_tensor, DeviceType::CPU), + "does not copy CPU-to-CPU"); } TEST_F(TensorPtrDeviceTest, ErrorNullCpuTensorData) { auto null_tensor = make_tensor_ptr({2, 2}, nullptr); ET_EXPECT_DEATH( - clone_tensor_ptr_to_device(null_tensor, DeviceType::CUDA), ""); + clone_tensor_ptr_to(null_tensor, DeviceType::CUDA), + "Source tensor has no data"); } -TEST_F(TensorPtrDeviceTest, ErrorCpuTensorToCpu) { +TEST_F(TensorPtrDeviceTest, ErrorDeviceToDevice) { auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); - ET_EXPECT_DEATH(clone_tensor_ptr_to_cpu(cpu_tensor), ""); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + ET_EXPECT_DEATH( + clone_tensor_ptr_to(device_tensor, Device(DeviceType::CUDA, /*index=*/1)), + "Device-to-device copy is not supported"); } TEST_F(TensorPtrDeviceTest, MakeTensorPtrVectorToDevice) { auto cpu_tensor = make_tensor_ptr({2, 2}, std::vector{1.0f, 2.0f, 3.0f, 4.0f}); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); EXPECT_EQ(device_tensor->dim(), 2); EXPECT_EQ(device_tensor->size(0), 2); @@ -248,7 +253,7 @@ TEST_F(TensorPtrDeviceTest, MakeTensorPtrVectorToDevice) { EXPECT_EQ(g_mock_cuda.allocate_count_, 1); EXPECT_EQ(g_mock_cuda.h2d_count_, 1); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); auto* data = roundtrip->const_data_ptr(); EXPECT_FLOAT_EQ(data[0], 1.0f); EXPECT_FLOAT_EQ(data[1], 2.0f); @@ -259,7 +264,7 @@ TEST_F(TensorPtrDeviceTest, MakeTensorPtrVectorToDevice) { TEST_F(TensorPtrDeviceTest, MakeTensorPtrRawPointerToDevice) { constexpr std::array raw{5.0f, 6.0f, 7.0f}; auto cpu_tensor = make_tensor_ptr({3}, const_cast(raw.data())); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); EXPECT_EQ(device_tensor->dim(), 1); EXPECT_EQ(device_tensor->size(0), 3); @@ -270,7 +275,7 @@ TEST_F(TensorPtrDeviceTest, MakeTensorPtrRawPointerToDevice) { EXPECT_EQ(g_mock_cuda.allocate_count_, 1); EXPECT_EQ(g_mock_cuda.h2d_count_, 1); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); auto* data = roundtrip->const_data_ptr(); EXPECT_FLOAT_EQ(data[0], 5.0f); EXPECT_FLOAT_EQ(data[1], 6.0f); @@ -279,8 +284,8 @@ TEST_F(TensorPtrDeviceTest, MakeTensorPtrRawPointerToDevice) { TEST_F(TensorPtrDeviceTest, CloneToCpuVerifiesCpuDeviceMetadata) { auto cpu_tensor = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f}); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto result = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto result = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_EQ(result->unsafeGetTensorImpl()->device_type(), DeviceType::CPU); EXPECT_EQ(result->unsafeGetTensorImpl()->device_index(), 0); @@ -288,8 +293,8 @@ TEST_F(TensorPtrDeviceTest, CloneToCpuVerifiesCpuDeviceMetadata) { TEST_F(TensorPtrDeviceTest, MultipleClonesFromSameSource) { auto cpu_tensor = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f}); - auto device1 = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto device2 = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device1 = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto device2 = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); EXPECT_NE(device1->const_data_ptr(), device2->const_data_ptr()); EXPECT_EQ(g_mock_cuda.allocate_count_, 2); @@ -302,14 +307,14 @@ TEST_F(TensorPtrDeviceTest, HighDimensionalTensorRoundtrip) { data[i] = static_cast(i); } auto cpu_tensor = make_tensor_ptr({2, 3, 4}, data); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); EXPECT_EQ(device_tensor->dim(), 3); EXPECT_EQ(device_tensor->size(0), 2); EXPECT_EQ(device_tensor->size(1), 3); EXPECT_EQ(device_tensor->size(2), 4); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); auto* result = roundtrip->const_data_ptr(); for (size_t i = 0; i < 24; ++i) { EXPECT_FLOAT_EQ(result[i], static_cast(i)); @@ -318,8 +323,8 @@ TEST_F(TensorPtrDeviceTest, HighDimensionalTensorRoundtrip) { TEST_F(TensorPtrDeviceTest, RoundtripDouble) { auto cpu_tensor = make_tensor_ptr({3}, std::vector{1.1, 2.2, 3.3}); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Double); auto* data = roundtrip->const_data_ptr(); @@ -330,8 +335,8 @@ TEST_F(TensorPtrDeviceTest, RoundtripDouble) { TEST_F(TensorPtrDeviceTest, RoundtripInt64) { auto cpu_tensor = make_tensor_ptr({3}, std::vector{100, 200, 300}); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Long); auto* data = roundtrip->const_data_ptr(); @@ -347,8 +352,8 @@ TEST_F(TensorPtrDeviceTest, LargeTensorRoundtrip) { data[i] = static_cast(i) * 0.1f; } auto cpu_tensor = make_tensor_ptr({static_cast(n)}, data); - auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); - auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto device_tensor = clone_tensor_ptr_to(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to(device_tensor, DeviceType::CPU); auto* result = roundtrip->const_data_ptr(); for (size_t i = 0; i < n; ++i) { diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index 182d0bfd58a..c0877aac924 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -52,7 +52,8 @@ "directory": "extension/tensor/test", "sources": [ "tensor_ptr_maker_test.cpp", - "tensor_ptr_test.cpp" + "tensor_ptr_test.cpp", + "tensor_ptr_device_test.cpp" ], "additional_libs": [ "extension_tensor"