diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.glsl new file mode 100644 index 00000000000..444572708bd --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.glsl @@ -0,0 +1,213 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * conv2d_gemm: GEMM step of im2col-backed conv2d. + * + * Reads the im2col'd input produced by conv2d_im2col.glsl as a 2D matrix + * of shape [M, K_total] (M = H_out * W_out, K_total = Kh*Kw*Cin_padded) + * and writes the conv2d output as texture3D channels-packed + * logical shape [1, C_out, H_out, W_out]. + * + * The im2col input can be any of: + * - texture2d, width-packed: texel at (k4, m) holds 4 K values for row m. + * IN_STORAGE=texture2d codegen. + * - texture3d, channels-packed: texel at (ow, oh, k4) holds 4 K values + * for output spatial position (oh, ow). Used when M would exceed + * max_texture2d_dim. IN_STORAGE=texture3d codegen. + * - buffer: vec4 at offset m*K4 + k4, same K packing. + * IN_STORAGE=buffer codegen. + * + * The matmul interpretation is: + * out[m, n] = sum_k im2col[m, k] * weight[n, k] + bias[n] + * with M = H_out * W_out, K = K_total, N = C_out. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if IN_STORAGE == "buffer" and DTYPE == "half": + ${define_explicit_type_extensions(DTYPE)} + +// VEC4_T is the input storage's natural texel type, which is also the tile type +// (the linear_fp_*_tile headers default the tile vec4 type to VEC4_T). For the +// buffer/half path this resolves to f16vec4, so the GEMM inner loop accumulates +// in true FP16 — the fma emits mad.f16 and the accumulators live in half-width +// registers. 
Texture-sampled half always returns vec4, so FP16 accumulation is +// naturally confined to the buffer (Mali) path; the texture variants (Adreno), +// where FP16 accumulation regresses, stay vec4 / FP32 with no extra gating. +#define VEC4_T ${texel_load_type(DTYPE, IN_STORAGE)} + +// OUT_VEC4_T is the output surface type. t_out is always texture3d, whose +// imageStore ABI takes vec4 (fp32) regardless of DTYPE, so the accumulator tile +// is cast from VEC4_T to OUT_VEC4_T at store time. +#define OUT_VEC4_T ${texel_load_type(DTYPE, "texture3d")} + +#define TILE_M4 ${TILE_M4} +#define TILE_K4 ${TILE_K4} +#define TILE_N4 ${TILE_N4} + +#define TILE_M ${TILE_M} +#define TILE_K ${TILE_K4 * 4} +#define TILE_N ${TILE_N4 * 4} + +$if IN_STORAGE == "buffer": + #define INPUT_BUFFER +$elif IN_STORAGE == "texture3d": + #define INPUT_TEXTURE3D + +${define_required_extensions("texture3d", DTYPE)} +$if IN_STORAGE == "buffer": + ${define_required_extensions("buffer", DTYPE)} + +layout(std430) buffer; + +#include "common.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} +$if IN_STORAGE == "buffer": + ${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer", is_scalar_array=False)} +$else: + ${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE)} +${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "texture2d")} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture2d")} + +${layout_declare_ubo(B, "ivec4", "out_sizes")} + +// Push constants are uploaded in 16-byte chunks (one ivec4 each). 
+layout(push_constant) uniform restrict Block { + ivec4 gemm_dims; // (K_total, K4_total, M, _unused) + vec4 clamp_vals; // (out_min, out_max, _unused, _unused) +}; + +#define K_TOTAL gemm_dims.x +#define K4_TOTAL gemm_dims.y +#define M_TOTAL gemm_dims.z +#define OUT_MIN clamp_vals.x +#define OUT_MAX clamp_vals.y + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "activation_type", "0")} + +#include "linear_fp_input_tile.glslh" +#include "linear_fp_packed_weight_tile_load.glslh" +#include "linear_fp_output_tile_fp_compute.glslh" + +/* + * Load TILE_M rows × TILE_K4 K-tiles of the im2col'd input. + * The im2col output is a contiguous (M, K_total/4) matrix of vec4s, so the + * load is a plain 2D fetch — no spatial decomposition. + */ +void load_input_tile_with_checks( + out FPInputTile tile, + const int k4_start, + const int m_start, + const int K4, + const int M, + const int W_out) { + // W_out is only consumed by the texture3d variant below. + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { + if (k4_start + k4 < K4 && m_start + m < M) { + const int row = m_start + m; + const int col = k4_start + k4; +#if defined(INPUT_BUFFER) + // Cast SSBO texel into the input tile type (f16vec4 for half, vec4 for + // float). + tile.data[m][k4] = LINEAR_FP_INPUT_TILE_VEC4_T(t_in[row * K4 + col]); +#elif defined(INPUT_TEXTURE3D) + // texture3d layout: row (the flat M index) decomposes into (ow, oh) + // and K4 is along the Z axis. texelFetch returns vec4 (fp32); cast to + // the input tile type. 
+ tile.data[m][k4] = LINEAR_FP_INPUT_TILE_VEC4_T( + texelFetch(t_in, ivec3(row % W_out, row / W_out, col), 0)); +#else + tile.data[m][k4] = + LINEAR_FP_INPUT_TILE_VEC4_T(texelFetch(t_in, ivec2(col, row), 0)); +#endif + } else { + tile.data[m][k4] = LINEAR_FP_INPUT_TILE_VEC4_T(0.0); + } + } + } +} + +void store_output_tile_with_checks( + const FPOutTile out_tile, + const int n4_start, + const int m_start, + const int N4, + const int M, + const int W_out) { + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + if (m_start + m < M && n4_start + n4 < N4) { + const int spatial = m_start + m; + // Cast the accumulator (f16vec4 for the buffer/half path) to the + // texture3d output surface type for the activation clamp and store. + OUT_VEC4_T texel = OUT_VEC4_T(out_tile.data[m][n4]); + if (activation_type == 1) { + texel = max(texel, OUT_VEC4_T(0.0)); + } else if (activation_type == 2) { + texel = clamp(texel, OUT_VEC4_T(OUT_MIN), OUT_VEC4_T(OUT_MAX)); + } + imageStore( + t_out, ivec3(spatial % W_out, spatial / W_out, n4_start + n4), texel); + } + } + } +} + +void main() { + const int tile_idx_n = int(gl_GlobalInvocationID.x); + const int tile_idx_m = int(gl_GlobalInvocationID.y); + + const int n4_start = tile_idx_n * TILE_N4; + const int m_start = tile_idx_m * TILE_M; + + const int W_out = out_sizes.x; + const int H_out = out_sizes.y; + const int M = M_TOTAL; + const int K4 = K4_TOTAL; + const int N = out_sizes.z; + const int N4 = div_up_4(N); + + if (n4_start >= N4 || m_start >= M) { + return; + } + + FPOutTile out_tile; + initialize(out_tile); + + FPInputTile in_tile; + FPWeightTile w_tile; + + for (int k4 = 0; k4 < K4; k4 += TILE_K4) { + load_input_tile_with_checks(in_tile, k4, m_start, K4, M, W_out); + load_packed_weight_tile_with_checks(w_tile, n4_start, k4, 0, N4, K4); + fp_accumulate_with_fp_weight(out_tile, in_tile, w_tile); + } + + // Apply bias. 
The bias texel depends only on n4, so fetch it once per n4 and + // add it to every m row rather than re-fetching inside the M loop. + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + if (n4_start + n4 < N4) { + // t_bias is an fp32 texture2d; cast its texel to the accumulator type. + const LINEAR_FP_OUTPUT_TILE_VEC4_T bias_texel = + LINEAR_FP_OUTPUT_TILE_VEC4_T( + texelFetch(t_bias, ivec2(n4_start + n4, 0), 0)); + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + out_tile.data[m][n4] += bias_texel; + } + } + } + + store_output_tile_with_checks(out_tile, n4_start, m_start, N4, M, W_out); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.yaml new file mode 100644 index 00000000000..15ec490b130 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.yaml @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_gemm: + parameter_names_with_default_values: + DTYPE: float + IN_STORAGE: texture2d + TILE_M4: 1 + TILE_K4: 1 + TILE_N4: 1 + TILE_M: 4 + generate_variant_forall: + combination: + parameter_names: [IN_STORAGE, DTYPE] + combos: + - parameter_values: [texture2d, float] + - parameter_values: [texture2d, half] + - parameter_values: [texture3d, float] + - parameter_values: [texture3d, half] + - parameter_values: [buffer, float] + - parameter_values: [buffer, half] + shader_variants: + - NAME: conv2d_gemm diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.glsl new file mode 100644 index 00000000000..20d07e3d1f8 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.glsl @@ -0,0 +1,120 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Im2col transformation for FP32 / FP16 conv2d. + * + * The output is a 2D matrix of shape [M, K_total] where + * M = H_out * W_out (number of output spatial positions) + * K_total = Kh * Kw * align_up_4(C_in) (flattened receptive field) + * + * K layout (so a 4-tile in K — one vec4 — holds the same kernel position): + * K = (ki * Kw + kj) * Cin_padded + ci + * + * Three codegen'd storage variants of the output tensor: + * - texture2d, width-packed: texel at (k4, m) holds 4 K values for spatial + * position m. Extents = (K_total/4, M). + * - texture3d, channels-packed: texel at (ow, oh, k4) holds 4 K values + * for output spatial position (oh, ow). Extents = (W_out, H_out, K4). + * Used as a fallback when M would exceed max_texture2d_dim. + * - buffer: vec4 at offset (m * K4 + k4), same K packing. + * + * The caller picks storage per device (Mali → buffer; others → texture2d + * when its 2D extents fit, texture3d when its 3D extents fit, else buffer). + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} + +$if OUT_STORAGE == "buffer": + #define OUTPUT_BUFFER + #define VEC4_BUF_T ${texel_load_type(DTYPE, "buffer")} +$elif OUT_STORAGE == "texture3d": + #define OUTPUT_TEXTURE3D + +${define_required_extensions("texture3d", DTYPE)} +$if OUT_STORAGE == "buffer": + ${define_required_extensions("buffer", DTYPE)} + +layout(std430) buffer; + +$if OUT_STORAGE == "buffer": + ${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer", is_scalar_array=False)} +$else: + ${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} + +${layout_declare_ubo(B, "ivec4", "in_sizes")} + +// Push constants are uploaded in 16-byte chunks (one ivec4 each) to comply +// with the per-entry size limit. 
+layout(push_constant) uniform restrict Block { + ivec4 kernel_stride; // (Kh, Kw, Sh, Sw) + ivec4 padding_dil; // (Ph, Pw, Dh, Dw) + ivec4 dims; // (Cin_padded, W_out, H_out, K4_total) +}; + +#define KERNEL_H kernel_stride.x +#define KERNEL_W kernel_stride.y +#define STRIDE_H kernel_stride.z +#define STRIDE_W kernel_stride.w +#define PADDING_H padding_dil.x +#define PADDING_W padding_dil.y +#define DILATION_H padding_dil.z +#define DILATION_W padding_dil.w +#define CIN_PADDED dims.x +#define W_OUT dims.y +#define H_OUT dims.z +#define K4_TOTAL dims.w + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const int k4 = int(gl_GlobalInvocationID.x); + const int m = int(gl_GlobalInvocationID.y); + const int M = H_OUT * W_OUT; + + if (k4 >= K4_TOTAL || m >= M) { + return; + } + + const int k_start = k4 * 4; + + // K = (ki * Kw + kj) * Cin_padded + ci ; since Cin_padded % 4 == 0, all 4 + // K values in this texel share the same (ki, kj) and span 4 consecutive + // ci values starting at ci_start. + const int krow_idx = k_start / CIN_PADDED; // ki * Kw + kj + const int ci_start = k_start % CIN_PADDED; + const int kj = krow_idx % KERNEL_W; + const int ki = krow_idx / KERNEL_W; + const int ci_blk = ci_start >> 2; // ci_start / 4 + + // Decompose flat output position m back into (oh, ow). + const int ow = m % W_OUT; + const int oh = m / W_OUT; + + // Compute the input spatial position for this (oh, ow, ki, kj). 
+ const int ih = oh * STRIDE_H - PADDING_H + ki * DILATION_H; + const int iw = ow * STRIDE_W - PADDING_W + kj * DILATION_W; + + VEC4_T out_texel = VEC4_T(0); + if (ih >= 0 && ih < in_sizes.y && iw >= 0 && iw < in_sizes.x) { + out_texel = texelFetch(t_in, ivec3(iw, ih, ci_blk), 0); + } + +#if defined(OUTPUT_BUFFER) + t_out[m * K4_TOTAL + k4] = VEC4_BUF_T(out_texel); +#elif defined(OUTPUT_TEXTURE3D) + imageStore(t_out, ivec3(ow, oh, k4), out_texel); +#else + imageStore(t_out, ivec2(k4, m), out_texel); +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.yaml new file mode 100644 index 00000000000..918d79298dd --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.yaml @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_im2col: + parameter_names_with_default_values: + DTYPE: float + OUT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [OUT_STORAGE, DTYPE] + combos: + - parameter_values: [texture2d, float] + - parameter_values: [texture2d, half] + - parameter_values: [texture3d, float] + - parameter_values: [texture3d, half] + - parameter_values: [buffer, float] + - parameter_values: [buffer, half] + shader_variants: + - NAME: conv2d_im2col diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.glsl new file mode 100644 index 00000000000..77f34324b4f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.glsl @@ -0,0 +1,120 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define BUF_T ${buffer_scalar_type(BUF_DTYPE)} +#define VEC4_T ${texel_load_type(DTYPE, PACKED_STORAGE)} +#define T ${texel_load_component_type(DTYPE, PACKED_STORAGE)} + +$if PACKED_STORAGE == "buffer": + #define OUTPUT_BUFFER + +#extension GL_EXT_control_flow_attributes : require + +${define_required_extensions("buffer", BUF_DTYPE)} +$if PACKED_STORAGE != "buffer": + ${define_required_extensions(PACKED_STORAGE, DTYPE)} + +layout(std430) buffer; + +#include "common.glslh" + +$if PACKED_STORAGE == "buffer": + ${layout_declare_tensor(B, "w", "t_weight_packed", DTYPE, "buffer", is_scalar_array=False)} +$else: + ${layout_declare_tensor(B, "w", "t_weight_packed", DTYPE, PACKED_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_src", BUF_DTYPE, "buffer", is_scalar_array=True)} + +// Push constants are uploaded in 16-byte chunks (one ivec4 each) to comply +// with the per-entry size limit. +layout(push_constant) uniform restrict Block { + ivec4 dims0; // (N=C_out, K=K_total, C_in, Cin_padded) + ivec4 dims1; // (K_h, K_w, _unused, _unused) +}; + +#define N dims0.x +#define K dims0.y +#define C_IN dims0.z +#define CIN_PADDED dims0.w +#define K_H dims1.x +#define K_W dims1.y + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Packs the ORIGINAL serialized conv2d weight [C_out, C_in, K_h, K_w] +// (PyTorch row-major contiguous) directly into the 4OC x 4IC blocked layout +// that conv2d_gemm.glsl loads via load_packed_weight_tile_with_checks, with no +// CPU-side repack of the serialized data. +// +// The GEMM treats the weight as [N=C_out, K=K_total] with the im2col K-axis +// layout +// k = (ki * K_w + kj) * Cin_padded + ci +// so each 4-tile of K holds 4 consecutive ci for one (ki, kj). 
Lanes with +// ci >= C_in are zero (Cin padding). +// +// This produces a byte-identical packed tensor to running the generic +// pack_fp_linear_weight (is_transposed=1) over the CPU-flattened [C_out, +// K_total] weight: a 4x4 block is transposed so packed[dk] = {w_flat[n4*4 + +// 0..3][k4*4 + dk]}. + +// Read the flattened weight scalar at logical (n, k) directly from the +// serialized [C_out, C_in, K_h, K_w] buffer, applying the im2col K decode and +// Cin padding. Returns 0 for out-of-range n / padding ci lanes. +T load_flat_weight_scalar(const int n, const int k) { + if (n >= N || k >= K) { + return T(0); + } + const int ci = k % CIN_PADDED; + if (ci >= C_IN) { + return T(0); // Cin padding lane + } + const int krow = k / CIN_PADDED; // ki * K_w + kj + const int kj = krow % K_W; + const int ki = krow / K_W; + // Serialized [C_out, C_in, K_h, K_w] contiguous index. + const int src_idx = ((n * C_IN + ci) * K_H + ki) * K_W + kj; + return T(t_weight_src[src_idx]); +} + +VEC4_T load_flat_weight_row(const int n, const int k_base) { + return VEC4_T( + load_flat_weight_scalar(n, k_base), + load_flat_weight_scalar(n, k_base + 1), + load_flat_weight_scalar(n, k_base + 2), + load_flat_weight_scalar(n, k_base + 3)); +} + +void main() { + const int n4 = int(gl_GlobalInvocationID.x); + const int k4 = int(gl_GlobalInvocationID.y); + + const int K4 = div_up_4(K); + const int N4 = div_up_4(N); + + if (n4 >= N4 || k4 >= K4) { + return; + } + + // Read 4 N-rows at the k4 column block, transpose into a 4OC x 4IC block. + // Mirrors the is_transposed branch of pack_fp_linear_weight. 
+ VEC4_T src_rows[4]; + [[unroll]] for (int dn = 0; dn < 4; dn++) { + src_rows[dn] = load_flat_weight_row(n4 * 4 + dn, k4 * 4); + } + [[unroll]] for (int dk = 0; dk < 4; dk++) { + VEC4_T out_val = VEC4_T( + src_rows[0][dk], src_rows[1][dk], src_rows[2][dk], src_rows[3][dk]); +#ifdef OUTPUT_BUFFER + t_weight_packed[(k4 * N4 + n4) * 4 + dk] = out_val; +#else + imageStore(t_weight_packed, ivec2(n4 * 4 + dk, k4), out_val); +#endif + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.yaml new file mode 100644 index 00000000000..42e0a8ab229 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.yaml @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +pack_conv2d_gemm_weight: + parameter_names_with_default_values: + DTYPE: float + BUF_DTYPE: float + PACKED_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [PACKED_STORAGE, DTYPE, BUF_DTYPE] + combos: + - parameter_values: [texture2d, float, float] + - parameter_values: [texture2d, half, half] + - parameter_values: [texture2d, half, float] + - parameter_values: [buffer, float, float] + - parameter_values: [buffer, half, half] + shader_variants: + - NAME: pack_conv2d_gemm_weight diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.cpp new file mode 100644 index 00000000000..352acbcfb50 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.cpp @@ -0,0 +1,416 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace vkcompute { + +namespace { + +// +// Weight handling +// + +// Prepack the ORIGINAL serialized conv2d weight [C_out, C_in, K_h, K_w] +// directly on the GPU into the 4OC x 4IC blocked layout that conv2d_gemm.glsl +// loads via load_packed_weight_tile_with_checks. The serialized weight data is +// read as-is (never CPU-repacked); pack_conv2d_gemm_weight.glsl performs the +// im2col K-axis reorder (k = (ki * K_w + kj) * Cin_padded + ci, ci-padding +// lanes zeroed) and the 4x4 transpose in one pass. +// +// The packed output is byte-identical to the layout the generic +// prepack_fp_linear_weight (is_transposed=1) produced over a CPU-flattened +// [C_out, K_total] weight, so conv2d_gemm.glsl is unchanged. +ValueRef prepack_conv2d_gemm_weight( + ComputeGraph& graph, + const ValueRef weight_data) { + const std::vector w_sizes = graph.sizes_of(weight_data); + VK_CHECK_COND(w_sizes.size() == 4); + const int64_t C_out = w_sizes[0]; + const int64_t C_in = w_sizes[1]; + const int64_t K_h = w_sizes[2]; + const int64_t K_w = w_sizes[3]; + + const int64_t Cin_padded = utils::align_up_4(C_in); + const int64_t K_total = K_h * K_w * Cin_padded; + + const int64_t N = C_out; + const int64_t K = K_total; + const int64_t N4 = utils::div_up(N, int64_t(4)); + const int64_t K4 = utils::div_up(K, int64_t(4)); + + // Packed tensor: K4 rows, N4*4 vec4 elements per row (4OC x 4IC blocks). + // kWidthPacked packs 4 scalars per texel, so width = N4*4*4 scalars. 
+ const int64_t output_height = K4; + const int64_t output_width = N4 * 4 * 4; + + utils::StorageType weight_storage = utils::kTexture2D; + const uint32_t max_extent = + graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width / 4 > max_extent || + utils::safe_downcast(output_height) > max_extent) { + weight_storage = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + {output_height, output_width}, + graph.dtype_of(weight_data), + weight_storage, + utils::kWidthPacked); + + const utils::uvec3 global_wg_size = { + utils::safe_downcast(N4), + utils::safe_downcast(K4), + 1u}; + + // Push constants must be uploaded in <= 16-byte (one ivec4) chunks; the + // shader's Block reads them back as dims0 / dims1. Layout must match + // pack_conv2d_gemm_weight.glsl. + const utils::ivec4 dims0{ + utils::safe_downcast(N), + utils::safe_downcast(K), + utils::safe_downcast(C_in), + utils::safe_downcast(Cin_padded)}; + const utils::ivec4 dims1{ + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + 0, + 0}; + + std::string kernel_name = "pack_conv2d_gemm_weight"; + add_storage_type_suffix(kernel_name, weight_storage); + add_dtype_suffix(kernel_name, graph.dtype_of(weight_data)); + add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(weight_data)); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + weight_data, + packed_weight, + {}, + {}, + {PushConstantDataInfo(&dims0, sizeof(dims0)), + PushConstantDataInfo(&dims1, sizeof(dims1))})); + + return packed_weight; +} + +// +// GEMM dispatch +// + +vkapi::ShaderInfo pick_conv2d_gemm_shader( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + // The im2col tensor's storage selects the input-load codegen variant of + // conv2d_gemm: texture2d, texture3d, or buffer.
+ const ValueRef im2col_in = args.at(1).refs.at(0); + + std::string kernel_name = "conv2d_gemm"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph->storage_type_of(im2col_in)); + add_dtype_suffix(kernel_name, graph->dtype_of(out)); + return VK_KERNEL_FROM_STR(kernel_name); +} + +utils::uvec3 pick_conv2d_gemm_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + const uint32_t W = graph->size_at(-1, out); + const uint32_t H = graph->size_at(-2, out); + const uint32_t C_out = graph->size_at(-3, out); + const uint32_t M = H * W; + const uint32_t N4 = utils::div_up_4(C_out); + // TILE_N4=1, TILE_M=4 + return {N4, utils::div_up(M, 4u), 1}; +} + +// Output sizes are determined by the conv shape (im2col tensor's spatial +// extents match the conv output), so the GEMM shader doesn't need to resize +// the output tensor — it's already set by the caller. We still need a noop +// resize because the dispatch infra expects one.
+void resize_conv2d_gemm_node( + ComputeGraph* /*graph*/, + const std::vector& /*args*/, + const std::vector& /*extra_args*/) { + // no-op +} + +void add_conv2d_gemm_node( + ComputeGraph& graph, + const ValueRef im2col_in, + const ValueRef packed_weight, + const ValueRef packed_bias, + const ValueRef out, + const int32_t K_total, + const int32_t M_total, + const bool clamp_out, + const float out_min_val, + const float out_max_val) { + const int32_t K4_total = K_total / 4; + + const utils::ivec4 gemm_dims{K_total, K4_total, M_total, 0}; + const utils::vec4 clamp_vals{out_min_val, out_max_val, 0.0f, 0.0f}; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + pick_conv2d_gemm_shader, + pick_conv2d_gemm_global_wg_size, + pick_hw_square_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, + {{im2col_in, packed_weight, packed_bias}, vkapi::kRead}}, + // Shader params buffers + {graph.sizes_ubo(out)}, + // Push constants (2 × 16 bytes) + {PushConstantDataInfo(&gemm_dims, sizeof(gemm_dims)), + PushConstantDataInfo(&clamp_vals, sizeof(clamp_vals))}, + // Specialization constants + // activation_type: 0=none, 1=relu, 2=clamp + {clamp_out ? 
2 : 0}, + // Resize args + {}, + // Resizing logic + resize_conv2d_gemm_node)); +} + +} // namespace + +// +// Orchestration +// + +void conv2d_gemm_impl( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef out, + const bool clamp_out, + const float out_min_val, + const float out_max_val, + const std::optional im2col_storage_override) { + const std::vector in_sizes = graph.sizes_of(in); + const std::vector w_sizes = graph.sizes_of(weight_data); + const std::vector out_sizes = graph.sizes_of(out); + VK_CHECK_COND(in_sizes.size() == 4 && in_sizes[0] == 1); + VK_CHECK_COND(w_sizes.size() == 4); + + const int64_t C_in = w_sizes[1]; + const int64_t K_h = w_sizes[2]; + const int64_t K_w = w_sizes[3]; + const int64_t H_out = out_sizes[2]; + const int64_t W_out = out_sizes[3]; + + const int64_t Cin_padded = utils::align_up_4(C_in); + const int64_t K_total = K_h * K_w * Cin_padded; + // Cin_padded is align_up_4(C_in), so K_total is a multiple of 4 and the + // K4_total = K_total / 4 division below is exact. + VK_CHECK_COND(K_total % 4 == 0); + + // Extract scalar conv params, scoping the IntListPtrs so they don't keep + // active value pointers around while we mutate the graph below. 
+ int32_t stride_h, stride_w, padding_h, padding_w, dilation_h, dilation_w; + { + const auto stride_list = graph.get_int_list(stride); + const auto padding_list = graph.get_int_list(padding); + const auto dilation_list = graph.get_int_list(dilation); + stride_h = utils::safe_downcast(stride_list->at(0)); + stride_w = utils::safe_downcast(stride_list->at(1)); + padding_h = utils::safe_downcast(padding_list->at(0)); + padding_w = utils::safe_downcast(padding_list->at(1)); + dilation_h = utils::safe_downcast(dilation_list->at(0)); + dilation_w = utils::safe_downcast(dilation_list->at(1)); + } + + const int64_t M = H_out * W_out; + const int64_t K4_total = K_total / 4; + + // Pick im2col storage. When an explicit override is provided (test-only), + // honor it and skip auto-selection. Otherwise run the production + // auto-selection per device: + // - Mali: always buffer (texture sampling on Mali is comparatively slow). + // - Others: prefer texture2d (M × K4_total). If that doesn't fit the + // device's max texture2d dim, fall back to texture3d laid out as + // (W_out, H_out, K4_total). Buffer is the last-resort fallback. 
+ utils::StorageType im2col_storage; + if (im2col_storage_override.has_value()) { + im2col_storage = im2col_storage_override.value(); + VK_CHECK_COND( + im2col_storage == utils::kBuffer || + im2col_storage == utils::kTexture2D || + im2col_storage == utils::kTexture3D); + } else if (graph.device_is_mali()) { + im2col_storage = utils::kBuffer; + } else { + const uint32_t max_2d = graph.context()->adapter_ptr()->max_texture2d_dim(); + const uint32_t max_3d = graph.context()->adapter_ptr()->max_texture3d_dim(); + const bool fits_2d = utils::safe_downcast(K4_total) <= max_2d && + utils::safe_downcast(M) <= max_2d; + const bool fits_3d = utils::safe_downcast(W_out) <= max_3d && + utils::safe_downcast(H_out) <= max_3d && + utils::safe_downcast(K4_total) <= max_3d; + if (fits_2d) { + im2col_storage = utils::kTexture2D; + } else if (fits_3d) { + im2col_storage = utils::kTexture3D; + } else { + im2col_storage = utils::kBuffer; + } + } + + // Allocate the im2col intermediate as a scoped scratch tensor. The im2col + // value is produced by the im2col node and consumed immediately by the GEMM + // node, both below, and is dead afterwards. Using a TmpTensor lets the memory + // planner alias one backing buffer across the (non-overlapping) im2col + // lifetimes of every conv2d layer, so peak memory tracks the largest single + // im2col rather than the sum of all of them. The TmpTensor must outlive + // add_conv2d_gemm_node (its last consumer), so it lives to the end of this + // function. + // + // The 2D and buffer variants use a flat [M, K_total] kWidthPacked shape; the + // texture3d variant uses the natural [1, K_total, H_out, W_out] + // kChannelsPacked shape so K4 lays along Z. Hoist the per-storage differences + // into locals so the TmpTensor is constructed exactly once and never needs to + // be copied or moved. 
+ std::vector im2col_sizes; + utils::StorageType im2col_tmp_storage; + utils::GPUMemoryLayout im2col_layout; + if (im2col_storage == utils::kTexture3D) { + im2col_sizes = {1, K_total, H_out, W_out}; + im2col_tmp_storage = utils::kTexture3D; + im2col_layout = utils::kChannelsPacked; + } else { + im2col_sizes = {M, K_total}; + im2col_tmp_storage = im2col_storage; + im2col_layout = utils::kWidthPacked; + } + TmpTensor im2col_tmp( + &graph, + im2col_sizes, + graph.dtype_of(in), + im2col_tmp_storage, + im2col_layout); + const ValueRef im2col_tensor = im2col_tmp.vref; + + // Step 1: im2col + add_conv2d_im2col_node( + graph, + in, + im2col_tensor, + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + utils::safe_downcast(Cin_padded), + utils::safe_downcast(H_out), + utils::safe_downcast(W_out)); + + // Step 2: prepack weight for the GEMM directly from the serialized + // [C_out, C_in, K_h, K_w] weight on the GPU. The serialized data is read + // as-is (never CPU-repacked); the prepack shader does the im2col K-axis + // reorder + 4x4 transpose into the layout conv2d_gemm.glsl loads via + // load_packed_weight_tile_with_checks. + ValueRef packed_weight = prepack_conv2d_gemm_weight(graph, weight_data); + + // Bias prepack: matches the bias format conv2d_gemm expects. prepack_biases + // only reads dim 0 (= C_out) of the weight, so the original 4D weight works + // directly. 
+ ValueRef packed_bias = prepack_biases( + graph, + bias, + weight_data, + /*transposed=*/false, + utils::kTexture2D, + utils::kWidthPacked); + + check_conv_args(graph, in, out); + + // Step 3: GEMM + add_conv2d_gemm_node( + graph, + im2col_tensor, + packed_weight, + packed_bias, + out, + utils::safe_downcast(K_total), + utils::safe_downcast(M), + clamp_out, + out_min_val, + out_max_val); +} + +// +// Op registration — matches aten.convolution.default's 10-arg signature: +// in, weight, bias, stride, padding, dilation, transposed, +// output_padding, groups, out +// +// Only the conv2d non-transposed, groups=1 case is supported. + +void conv2d_gemm_op(ComputeGraph& graph, const std::vector& args) { + VK_CHECK_COND(args.size() == 10); + const ValueRef in = args[0]; + const ValueRef weight = args[1]; + const ValueRef bias = args[2]; + const ValueRef stride = args[3]; + const ValueRef padding = args[4]; + const ValueRef dilation = args[5]; + const ValueRef transposed = args[6]; + const ValueRef /*output_padding*/ _output_padding = args[7]; + (void)_output_padding; + const ValueRef groups = args[8]; + const ValueRef out = args[9]; + + VK_CHECK_COND(graph.get_bool(transposed) == false); + VK_CHECK_COND(graph.get_int(groups) == 1); + + conv2d_gemm_impl( + graph, + in, + weight, + bias, + stride, + padding, + dilation, + out, + /*clamp_out=*/false, + /*out_min_val=*/0.0f, + /*out_max_val=*/0.0f); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.conv2d_gemm.default, conv2d_gemm_op); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.h b/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.h new file mode 100644 index 00000000000..73e95887266 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace vkcompute { + +/* + * End-to-end orchestration for an FP32 / FP16 conv2d computed as + * im2col -> GEMM. The dataflow is: + * + * t_in [1, C_in, H_in, W_in] + * │ + * ▼ (add_conv2d_im2col_node, conv2d_im2col.glsl) + * im2col [1, K_total, H_out, W_out] K_total = Kh * Kw * align_up_4(C_in) + * │ + * │ + packed weight [C_out, K_total] (prepacked on the GPU from the + * │ serialized [C_out, C_in, Kh, Kw] data; no CPU-side repacking) + * ▼ (add_conv2d_gemm_node, conv2d_gemm.glsl) + * t_out [1, C_out, H_out, W_out] + * + * This function performs both dispatch and prepack registration. The im2col + * intermediate is allocated as a temporary graph tensor; the weight is + * prepacked by a shader that reads the serialized weight data as-is. + * + * Constraints (asserted internally): + * - input batch == 1 + * - weight rank == 4 + * - groups == 1 (general grouped conv not yet supported) + * - transposed == false + * + * `im2col_storage_override` controls the storage type of the im2col + * intermediate tensor (and, by extension, the conv2d_gemm input-load variant): + * - std::nullopt (default): the production path. Storage is auto-selected + * from device characteristics and texture-extent limits — byte-for-byte + * the same selection used by the registered op. + * - a concrete StorageType: force that storage, skipping auto-selection. + * Used by tests to exercise each storage variant deterministically and + * independently of the device. Must be one of kBuffer / kTexture2D / + * kTexture3D. 
+ */ +void conv2d_gemm_impl( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef out, + const bool clamp_out = false, + const float out_min_val = 0.0f, + const float out_max_val = 0.0f, + const std::optional im2col_storage_override = + std::nullopt); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.cpp new file mode 100644 index 00000000000..150275b5ac4 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace vkcompute { + +// Push constants are uploaded in 16-byte chunks (one ivec4 each) to comply +// with the per-entry size limit. 
Layout matches conv2d_im2col.glsl: +// { ivec4 kernel_stride, ivec4 padding_dil, ivec4 dims } + +void add_conv2d_im2col_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef im2col_out, + const int32_t kernel_h, + const int32_t kernel_w, + const int32_t stride_h, + const int32_t stride_w, + const int32_t padding_h, + const int32_t padding_w, + const int32_t dilation_h, + const int32_t dilation_w, + const int32_t Cin_padded, + const int32_t H_out, + const int32_t W_out) { + const utils::StorageType out_storage = graph.storage_type_of(im2col_out); + VK_CHECK_COND( + out_storage == utils::kBuffer || out_storage == utils::kTexture2D || + out_storage == utils::kTexture3D); + + std::string kernel_name = "conv2d_im2col"; + add_storage_type_suffix(kernel_name, out_storage); + add_dtype_suffix(kernel_name, graph.dtype_of(im2col_out)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + const int32_t M = H_out * W_out; + // K_total is laid out so that 4-tiles share a kernel position; since + // Cin_padded is a multiple of 4, K_total is also a multiple of 4. + const int32_t K_total = kernel_h * kernel_w * Cin_padded; + const int32_t K4_total = K_total / 4; + + const utils::ivec4 kernel_stride{kernel_h, kernel_w, stride_h, stride_w}; + const utils::ivec4 padding_dil{padding_h, padding_w, dilation_h, dilation_w}; + const utils::ivec4 dims{Cin_padded, W_out, H_out, K4_total}; + + // Global wg: one thread per (k4, m) vec4 in the output. 
+ const utils::uvec3 global_wg_size{ + utils::safe_downcast(K4_total), + utils::safe_downcast(M), + 1u}; + const utils::uvec3 local_wg_size{16u, 4u, 1u}; + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + shader, + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{im2col_out, vkapi::kWrite}, {in, vkapi::kRead}}, + // UBOs + {graph.sizes_ubo(in)}, + // Push constants (3 × ivec4 = 48 bytes, split per 16-byte limit) + {PushConstantDataInfo(&kernel_stride, sizeof(kernel_stride)), + PushConstantDataInfo(&padding_dil, sizeof(padding_dil)), + PushConstantDataInfo(&dims, sizeof(dims))}, + // Specialization constants + {}, + // Resize args + {}, + // Resizing logic + nullptr)); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.h b/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.h new file mode 100644 index 00000000000..8821db181bd --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +/* + * Dispatch a single im2col transformation node for an FP32 / FP16 conv2d. + * + * Produces the im2col matrix of logical shape + * [M, K_total] (stored as [1, K_total, H_out, W_out] for texture3d) + * where + * M = H_out * W_out + * K_total = kernel_h * kernel_w * align_up_4(C_in) + * + * The K dimension is laid out so that consecutive 4-tiles of K hold 4 + * consecutive ci values for the same (ki, kj) kernel position. This is the + * layout `conv2d_gemm` consumes for the GEMM step. + * + * The im2col output tensor's storage type (buffer, texture2d width-packed, + * or texture3d channels-packed) is chosen by the caller; this function picks + * the shader variant matching `graph.storage_type_of(im2col_out)`. 
+ * + * Inputs: + * in : input texture3D channels-packed [1, C_in, H_in, W_in] + * im2col_out : output tensor, caller-allocated: [M, K_total] for buffer or + * width-packed texture2d; [1, K_total, H_out, W_out] for texture3d + * kernel_h/w : conv kernel dimensions + * stride_* : conv strides + * padding_* : conv paddings + * dilation_* : conv dilations + * Cin_padded : align_up_4(C_in) + * H_out, W_out: output spatial extents + */ +void add_conv2d_im2col_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef im2col_out, + const int32_t kernel_h, + const int32_t kernel_w, + const int32_t stride_h, + const int32_t stride_w, + const int32_t padding_h, + const int32_t padding_w, + const int32_t dilation_h, + const int32_t dilation_w, + const int32_t Cin_padded, + const int32_t H_out, + const int32_t W_out); + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2d.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2d.cpp new file mode 100644 index 00000000000..8949276740c --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestConv2d.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +#include + +namespace vkcompute { + +void test_conv2d(ComputeGraph& graph, const std::vector& args) { + // args[0] = input [N, C_in, H, W] + // args[1] = weight [C_out, C_in, K_h, K_w] (constant) + // args[2] = bias (constant, or none) + // args[3] = stride_h (int) + // args[4] = stride_w (int) + // args[5] = padding_h (int) + // args[6] = padding_w (int) + // args[7] = dilation_h (int) + // args[8] = dilation_w (int) + // args[9] = impl_selector (string) + // args[10] = output [N, C_out, H_out, W_out] + // + // impl_selector grammar: + // "" -> aten.convolution.default (direct sliding-window) + // "im2col" -> et_vk.conv2d_gemm.default, auto im2col storage + // "im2col_buffer"-> im2col/GEMM, force buffer im2col intermediate + // "im2col_tex2d" -> im2col/GEMM, force texture2d im2col intermediate + // "im2col_tex3d" -> im2col/GEMM, force texture3d im2col intermediate + const ValueRef input = args.at(0); + const ValueRef weight = args.at(1); + const ValueRef bias = args.at(2); + const int64_t stride_h = graph.extract_scalar(args.at(3)); + const int64_t stride_w = graph.extract_scalar(args.at(4)); + const int64_t padding_h = graph.extract_scalar(args.at(5)); + const int64_t padding_w = graph.extract_scalar(args.at(6)); + const int64_t dilation_h = graph.extract_scalar(args.at(7)); + const int64_t dilation_w = graph.extract_scalar(args.at(8)); + const std::string impl_selector = graph.extract_string(args.at(9)); + const ValueRef out = args.at(10); + + ValueRef stride = + graph.add_scalar_list(std::vector{stride_h, stride_w}); + ValueRef padding = graph.add_scalar_list( + std::vector{padding_h, padding_w}); + ValueRef dilation = graph.add_scalar_list( + std::vector{dilation_h, dilation_w}); + + // The forced-storage variants must reach conv2d_gemm_impl with the override, + // which the registered op (et_vk.conv2d_gemm.default) cannot express since it + // always auto-selects. 
Route those directly to conv2d_gemm_impl; the auto + // ("im2col") and direct ("") paths stay on the registered-op dispatch. + std::optional im2col_storage_override; + if (impl_selector == "im2col_buffer") { + im2col_storage_override = utils::kBuffer; + } else if (impl_selector == "im2col_tex2d") { + im2col_storage_override = utils::kTexture2D; + } else if (impl_selector == "im2col_tex3d") { + im2col_storage_override = utils::kTexture3D; + } + + if (im2col_storage_override.has_value()) { + conv2d_gemm_impl( + graph, + input, + weight, + bias, + stride, + padding, + dilation, + out, + /*clamp_out=*/false, + /*out_min_val=*/0.0f, + /*out_max_val=*/0.0f, + im2col_storage_override); + return; + } + + ValueRef transposed = graph.add_scalar(false); + ValueRef output_padding = + graph.add_scalar_list(std::vector{0, 0}); + ValueRef groups = graph.add_scalar(1); + + const std::string target_op = (impl_selector == "im2col") + ? "et_vk.conv2d_gemm.default" + : "aten.convolution.default"; + + VK_GET_OP_FN(target_op.c_str()) + (graph, + {input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + out}); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.test_conv2d.default, test_conv2d); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index f9501eeb424..7ff7b6ec426 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -101,6 +101,7 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("test_q8ta_pixel_shuffle") define_custom_op_test_binary("test_q8ta_unary") define_custom_op_test_binary("test_mm") + define_custom_op_test_binary("test_conv2d") define_custom_op_test_binary("test_conv2d_pw") define_custom_op_test_binary("test_conv2d_dw") define_custom_op_test_binary("test_embedding_q4gsw") diff --git a/backends/vulkan/test/custom_ops/test_conv2d.cpp 
b/backends/vulkan/test/custom_ops/test_conv2d.cpp new file mode 100644 index 00000000000..f56a2d81407 --- /dev/null +++ b/backends/vulkan/test/custom_ops/test_conv2d.cpp @@ -0,0 +1,576 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include +#include + +#include "conv2d_utils.h" +#include "utils.h" + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 64; + +struct InputDims { + int64_t N; + int64_t C; + int64_t H; + int64_t W; + + InputDims(int64_t n, int64_t c, int64_t h, int64_t w) + : N(n), C(c), H(h), W(w) {} +}; + +struct Conv2dTestConfig { + InputDims dims; + int64_t C_out; + KernelSize kernel; + Stride stride; + Padding padding; + Dilation dilation; + bool has_bias; +}; + +static int64_t calc_out_size( + int64_t in_size, + int64_t kernel_size, + int64_t stride, + int64_t padding, + int64_t dilation) { + return (in_size + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + + 1; +} + +// Shared perf/skip classification used by both create_conv2d_test_case (to tag +// PERF vs ACCU) and conv2d_reference_impl (to gate the large-K FP16 reference +// check). A shape is "perf" if any dimension reaches kRefDimSizeLimit; the +// boundary is inclusive (>=) so a 64-wide dim counts as perf — FP16 +// accumulation error at K = K_h * K_w * C_in for such shapes can exceed the +// half tolerance and false-fail. Keep both call sites on this single helper to +// avoid the two predicates drifting apart. 
+static bool +conv2d_is_perf_shape(int64_t C_in, int64_t C_out, int64_t H, int64_t W) { + return C_in >= kRefDimSizeLimit || C_out >= kRefDimSizeLimit || + H >= kRefDimSizeLimit || W >= kRefDimSizeLimit; +} + +static TestCase create_conv2d_test_case( + const Conv2dTestConfig& config, + vkapi::ScalarType dtype, + utils::StorageType storage_type, + utils::GPUMemoryLayout memory_layout, + const std::string& impl_selector = "") { + TestCase test_case; + + bool is_perf = conv2d_is_perf_shape( + config.dims.C, config.C_out, config.dims.H, config.dims.W); + + std::string prefix = is_perf ? "PERF" : "ACCU"; + std::string storage_str = repr_str(storage_type, memory_layout); + std::string dtype_str = dtype_short(dtype); + std::string bias_str = config.has_bias ? "+bias" : ""; + + int64_t H_out = calc_out_size( + config.dims.H, + config.kernel.h, + config.stride.h, + config.padding.h, + config.dilation.h); + int64_t W_out = calc_out_size( + config.dims.W, + config.kernel.w, + config.stride.w, + config.padding.w, + config.dilation.w); + + std::string shape = "[" + std::to_string(config.dims.N) + "," + + std::to_string(config.dims.C) + "," + std::to_string(config.dims.H) + + "," + std::to_string(config.dims.W) + "]->[" + + std::to_string(config.C_out) + "] k" + std::to_string(config.kernel.h) + + "x" + std::to_string(config.kernel.w) + " s" + + std::to_string(config.stride.h) + " p" + + std::to_string(config.padding.h) + " d" + + std::to_string(config.dilation.h); + + std::string suffix = bias_str; + if (!impl_selector.empty()) { + if (!suffix.empty()) { + suffix += " "; + } + suffix += "[" + impl_selector + "]"; + } + + std::string name = + make_test_label(prefix, dtype_str, dtype_str, shape, storage_str, suffix); + + test_case.set_name(name); + test_case.set_operator_name("test_etvk.test_conv2d.default"); + + // Input tensor [N, C_in, H, W] + ValueSpec input( + {config.dims.N, config.dims.C, config.dims.H, config.dims.W}, + dtype, + storage_type, + memory_layout, + 
DataGenType::RANDOM); + + // Weight tensor [C_out, C_in, K_h, K_w] - constant + ValueSpec weight( + {config.C_out, config.dims.C, config.kernel.h, config.kernel.w}, + dtype, + storage_type, + memory_layout, + DataGenType::RANDOM); + weight.set_constant(true); + + test_case.add_input_spec(input); + test_case.add_input_spec(weight); + + // Bias (or none) + if (config.has_bias) { + ValueSpec bias( + {config.C_out}, + dtype, + storage_type, + memory_layout, + DataGenType::RANDOM); + bias.set_constant(true); + test_case.add_input_spec(bias); + } else { + ValueSpec none_bias(static_cast(0)); + none_bias.set_none(true); + test_case.add_input_spec(none_bias); + } + + // stride_h, stride_w, padding_h, padding_w, dilation_h, dilation_w + test_case.add_input_spec(ValueSpec(static_cast(config.stride.h))); + test_case.add_input_spec(ValueSpec(static_cast(config.stride.w))); + test_case.add_input_spec(ValueSpec(static_cast(config.padding.h))); + test_case.add_input_spec(ValueSpec(static_cast(config.padding.w))); + test_case.add_input_spec(ValueSpec(static_cast(config.dilation.h))); + test_case.add_input_spec(ValueSpec(static_cast(config.dilation.w))); + + // impl_selector string + test_case.add_input_spec(ValueSpec::make_string(impl_selector)); + + // Output tensor [N, C_out, H_out, W_out] + ValueSpec output( + {config.dims.N, config.C_out, H_out, W_out}, + dtype, + storage_type, + memory_layout, + DataGenType::ZEROS); + test_case.add_output_spec(output); + + if (dtype == vkapi::kHalf) { + test_case.set_abs_tolerance(1e-1f); + test_case.set_rel_tolerance(1e-2f); + } else { + test_case.set_abs_tolerance(1e-3f); + test_case.set_rel_tolerance(1e-3f); + } + + test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"}); + + return test_case; +} + +// Reference implementation for general conv2d (groups=1). +// +// Supports both FP32 and (small-shape) FP16 inputs. 
The math is always done in +// float; for FP16 the master input/weight/bias values are dequantized from +// their half storage via get_element(), and the resulting float reference is +// compared against the dequantized GPU output by validate_against_reference(). +// +// FP16 accumulation error grows with K (= K_h * K_w * C_in). For large-K PERF +// shapes the FP32 reference would diverge from the GPU's FP16 accumulation +// enough to trip even the relaxed half tolerance, producing false failures, so +// those are intentionally left timing-only: this function throws +// std::invalid_argument, which execute_test_cases() catches to skip the +// correctness check (ref_computed stays false) while still benchmarking. +static void conv2d_reference_impl(TestCase& test_case) { + const ValueSpec& input = test_case.inputs()[0]; + const ValueSpec& weight = test_case.inputs()[1]; + const ValueSpec& bias_spec = test_case.inputs()[2]; + ValueSpec& output = test_case.outputs()[0]; + + if (input.dtype != vkapi::kFloat && input.dtype != vkapi::kHalf) { + throw std::invalid_argument("Reference only supports float and half"); + } + + auto input_sizes = input.get_tensor_sizes(); + auto weight_sizes = weight.get_tensor_sizes(); + auto output_sizes = output.get_tensor_sizes(); + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t H_in = input_sizes[2]; + int64_t W_in = input_sizes[3]; + int64_t C_out = weight_sizes[0]; + int64_t K_h = weight_sizes[2]; + int64_t K_w = weight_sizes[3]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // For FP16, only compute a reference for small (ACCU) shapes where K is small + // enough that FP32-vs-FP16 accumulation error stays within the half + // tolerance. Large-K PERF half shapes stay timing-only via the throw below. + // The predicate mirrors create_conv2d_test_case's is_perf classification. 
+ if (input.dtype == vkapi::kHalf) { + const bool is_perf = conv2d_is_perf_shape(C_in, C_out, H_in, W_in); + if (is_perf) { + throw std::invalid_argument( + "Half reference skipped for large-K PERF shape (timing-only)"); + } + } + + int64_t stride_h = test_case.inputs()[3].get_int_value(); + int64_t stride_w = test_case.inputs()[4].get_int_value(); + int64_t padding_h = test_case.inputs()[5].get_int_value(); + int64_t padding_w = test_case.inputs()[6].get_int_value(); + int64_t dilation_h = test_case.inputs()[7].get_int_value(); + int64_t dilation_w = test_case.inputs()[8].get_int_value(); + + // get_element() materializes a float regardless of dtype (it dequantizes + // half master data), so the same loop body serves both FP32 and FP16. + auto& ref_data = output.get_ref_float_data(); + ref_data.resize(N * C_out * H_out * W_out, 0.0f); + + for (int64_t n = 0; n < N; ++n) { + for (int64_t co = 0; co < C_out; ++co) { + for (int64_t oh = 0; oh < H_out; ++oh) { + for (int64_t ow = 0; ow < W_out; ++ow) { + float sum = 0.0f; + for (int64_t ci = 0; ci < C_in; ++ci) { + for (int64_t kh = 0; kh < K_h; ++kh) { + for (int64_t kw = 0; kw < K_w; ++kw) { + int64_t ih = oh * stride_h - padding_h + kh * dilation_h; + int64_t iw = ow * stride_w - padding_w + kw * dilation_w; + if (ih >= 0 && ih < H_in && iw >= 0 && iw < W_in) { + float in_val = input.get_element( + n * (C_in * H_in * W_in) + ci * (H_in * W_in) + + ih * W_in + iw); + // weight is [C_out, C_in, K_h, K_w] + float w_val = weight.get_element( + co * (C_in * K_h * K_w) + ci * (K_h * K_w) + kh * K_w + + kw); + sum += in_val * w_val; + } + } + } + } + if (!bias_spec.is_none()) { + sum += bias_spec.get_element(co); + } + ref_data + [n * (C_out * H_out * W_out) + co * (H_out * W_out) + oh * W_out + + ow] = sum; + } + } + } + } +} + +static std::vector generate_conv2d_test_cases() { + std::vector test_cases; + + std::vector storage_types = {utils::kTexture3D}; + utils::GPUMemoryLayout layout = utils::kChannelsPacked; + + // 
Accuracy shapes (small enough for float reference validation) + std::vector accuracy_configs = { + // 3x3 stride=1 pad=1 same-channels (the bottleneck pattern in TinyCNN) + {InputDims(1, 8, 8, 8), + 8, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 8, 8, 8), + 8, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + {InputDims(1, 16, 16, 16), + 16, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // 3x3 stride=2 (downsample) with channel expansion + {InputDims(1, 8, 16, 16), + 16, + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + // 3x3 stride=1 with channel reduction + {InputDims(1, 16, 8, 8), + 8, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + // Non-multiple-of-4 channels + {InputDims(1, 11, 8, 8), + 13, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + // 3-channel input (like RGB stem) + {InputDims(1, 3, 16, 16), + 8, + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + }; + + // TinyCNN depth estimator hotspots (from profiling + // UNTRAINED_TinyCNNDepthEstimatorRealTime_Vulkan.pte). + // Each entry lists (C_in, H, W) -> C_out, all 3x3 stride=1 pad=1 unless + // noted. Together the first 6 entries account for ~89% of all conv time. 
+ std::vector perf_configs = { + // #1: 21.25% — (1,128,36,48)->(1,128,36,48) + {InputDims(1, 128, 36, 48), + 128, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // #2: 20.68% — (1,256,18,24)->(1,256,18,24) + {InputDims(1, 256, 18, 24), + 256, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // #3: 20.01% — (1,64,72,96)->(1,64,72,96) + {InputDims(1, 64, 72, 96), + 64, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // #4: 13.25% — (1,32,144,192)->(1,32,144,192) + {InputDims(1, 32, 144, 192), + 32, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // #5: 6.74% — (1,64,36,48)->(1,64,36,48) + {InputDims(1, 64, 36, 48), + 64, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // #6: 5.90% — (1,32,72,96)->(1,32,72,96) + {InputDims(1, 32, 72, 96), + 32, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // Secondary cases + // 3x3 stride=2 downsample with channel expansion: 1.52% + {InputDims(1, 32, 72, 96), + 128, + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + // 3x3 stride=1 same-shape, smaller spatial: 1.51% + {InputDims(1, 128, 18, 24), + 128, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // 3x3 stride=1, channel reduction + {InputDims(1, 128, 18, 24), + 64, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 64, 36, 48), + 32, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + // 3x3 stride=2 downsample, same channels + {InputDims(1, 32, 72, 96), + 32, + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + {InputDims(1, 64, 36, 48), + 64, + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + false}, + // RGB stem + {InputDims(1, 3, 144, 192), + 32, + KernelSize(3, 3), + Stride(2, 
2), + Padding(1, 1), + Dilation(1, 1), + false}, + }; + + // Small shapes used to exercise each im2col intermediate-storage variant + // (buffer / texture2d / texture3d) deterministically and independently of + // the device's auto-selection. All dims < kRefDimSizeLimit so the float + // reference validates them. For the texture3d case the im2col intermediate + // is the channels-packed [1, K_total, H_out, W_out] = [1, 144, 16, 16] for + // the 16x16 shape — tiny, so it always fits in a texture3d. Forcing is + // required: auto-selection would never pick texture3d for a small shape. + std::vector per_variant_configs = { + // 3x3 s1 p1, channels multiple of 4 + {InputDims(1, 16, 16, 16), + 16, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + true}, + // Non-multiple-of-4 channels exercise the Cin padding path + {InputDims(1, 11, 12, 12), + 13, + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + false}, + }; + + // Two implementation variants: direct sliding-window (default) and im2col. + const std::vector impls = {"", "im2col"}; + // Forced-storage im2col variants for the per-variant ACCU coverage. + const std::vector forced_storage_impls = { + "im2col_buffer", "im2col_tex2d", "im2col_tex3d"}; + + // Generate accuracy test cases for both impls and both dtypes. FP16 small + // shapes get a real reference check (gated in conv2d_reference_impl); we run + // both dtypes so we catch correctness regressions in either path. Large-K + // half stays timing-only via the reference's PERF-shape throw. 
+ const std::vector accu_dtypes = { + vkapi::kFloat, vkapi::kHalf}; + for (const auto& config : accuracy_configs) { + for (auto st : storage_types) { + for (auto dtype : accu_dtypes) { + for (const auto& impl : impls) { + test_cases.push_back( + create_conv2d_test_case(config, dtype, st, layout, impl)); + } + } + } + } + + // Generate per-variant forced-storage ACCU cases (FP32 and FP16) so all + // three im2col intermediate-storage variants get deterministic, + // device-independent, reference-checked coverage at small K. + for (const auto& config : per_variant_configs) { + for (auto st : storage_types) { + for (auto dtype : accu_dtypes) { + for (const auto& impl : forced_storage_impls) { + test_cases.push_back( + create_conv2d_test_case(config, dtype, st, layout, impl)); + } + } + } + } + + // Generate performance test cases (float and half) for both impls. + for (const auto& config : perf_configs) { + std::vector dtypes = {vkapi::kFloat, vkapi::kHalf}; + for (auto dtype : dtypes) { + for (auto st : storage_types) { + for (const auto& impl : impls) { + test_cases.push_back( + create_conv2d_test_case(config, dtype, st, layout, impl)); + } + } + } + } + + return test_cases; +} + +static int64_t conv2d_flop_calculator(const TestCase& test_case) { + auto input_sizes = test_case.inputs()[0].get_tensor_sizes(); + auto weight_sizes = test_case.inputs()[1].get_tensor_sizes(); + auto output_sizes = test_case.outputs()[0].get_tensor_sizes(); + + int64_t N = output_sizes[0]; + int64_t C_out = output_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + int64_t C_in = input_sizes[1]; + int64_t K_h = weight_sizes[2]; + int64_t K_w = weight_sizes[3]; + + return 2 * N * C_out * C_in * H_out * W_out * K_h * K_w; +} + +static void reference_impl(TestCase& test_case) { + conv2d_reference_impl(test_case); +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + 
set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "General Conv2d (SlidingWindow, groups=1) Benchmark" + << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + execute_test_cases( + generate_conv2d_test_cases, + conv2d_flop_calculator, + "Conv2d", + /*warmup_runs = */ 5, + /*benchmark_runs = */ 20, + ref_fn); + + return 0; +}