diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.glsl
new file mode 100644
index 00000000000..444572708bd
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.glsl
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * conv2d_gemm: GEMM step of im2col-backed conv2d.
+ *
+ * Reads the im2col'd input produced by conv2d_im2col.glsl as a 2D matrix
+ * of shape [M, K_total] (M = H_out * W_out, K_total = Kh*Kw*Cin_padded)
+ * and writes the conv2d output as texture3D channels-packed
+ *   logical shape [1, C_out, H_out, W_out].
+ *
+ * The im2col input can be any of:
+ *   - texture2d, width-packed: texel at (k4, m) holds 4 K values for row m.
+ *     IN_STORAGE=texture2d codegen.
+ *   - texture3d, channels-packed: texel at (ow, oh, k4) holds 4 K values
+ *     for output spatial position (oh, ow).  Used when M would exceed
+ *     max_texture2d_dim.  IN_STORAGE=texture3d codegen.
+ *   - buffer: vec4 at offset m*K4 + k4, same K packing.
+ *     IN_STORAGE=buffer codegen.
+ *
+ * The matmul interpretation is:
+ *   out[m, n] = sum_k im2col[m, k] * weight[n, k] + bias[n]
+ * with M = H_out * W_out, K = K_total, N = C_out.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+$if IN_STORAGE == "buffer" and DTYPE == "half":
+  ${define_explicit_type_extensions(DTYPE)}
+
+// VEC4_T is the input storage's natural texel type, which is also the tile type
+// (the linear_fp_*_tile headers default the tile vec4 type to VEC4_T). For the
+// buffer/half path this resolves to f16vec4, so the GEMM inner loop accumulates
+// in true FP16 — the fma emits mad.f16 and the accumulators live in half-width
+// registers. Texture-sampled half always returns vec4, so FP16 accumulation is
+// naturally confined to the buffer (Mali) path; the texture variants (Adreno),
+// where FP16 accumulation regresses, stay vec4 / FP32 with no extra gating.
+#define VEC4_T ${texel_load_type(DTYPE, IN_STORAGE)}
+
+// OUT_VEC4_T is the output surface type. t_out is always texture3d, whose
+// imageStore ABI takes vec4 (fp32) regardless of DTYPE, so the accumulator tile
+// is cast from VEC4_T to OUT_VEC4_T at store time.
+#define OUT_VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+
+#define TILE_M4 ${TILE_M4}
+#define TILE_K4 ${TILE_K4}
+#define TILE_N4 ${TILE_N4}
+
+#define TILE_M ${TILE_M}
+#define TILE_K ${TILE_K4 * 4}
+#define TILE_N ${TILE_N4 * 4}
+
+$if IN_STORAGE == "buffer":
+  #define INPUT_BUFFER
+$elif IN_STORAGE == "texture3d":
+  #define INPUT_TEXTURE3D
+
+${define_required_extensions("texture3d", DTYPE)}
+$if IN_STORAGE == "buffer":
+  ${define_required_extensions("buffer", DTYPE)}
+
+layout(std430) buffer;
+
+#include "common.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+$if IN_STORAGE == "buffer":
+  ${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer", is_scalar_array=False)}
+$else:
+  ${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE)}
+${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "texture2d")}
+${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture2d")}
+
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+
+// Push constants are uploaded in 16-byte chunks (one ivec4 or vec4 each).
+layout(push_constant) uniform restrict Block {
+  ivec4 gemm_dims;   // (K_total, K4_total, M, _unused)
+  vec4  clamp_vals;  // (out_min, out_max, _unused, _unused)
+};
+
+#define K_TOTAL  gemm_dims.x
+#define K4_TOTAL gemm_dims.y
+#define M_TOTAL  gemm_dims.z
+#define OUT_MIN  clamp_vals.x
+#define OUT_MAX  clamp_vals.y
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "activation_type", "0")}
+
+#include "linear_fp_input_tile.glslh"
+#include "linear_fp_packed_weight_tile_load.glslh"
+#include "linear_fp_output_tile_fp_compute.glslh"
+
+/*
+ * Fills `tile` with TILE_M rows x TILE_K4 vec4 columns of the im2col'd input,
+ * starting at row m_start and vec4 column k4_start. Entries with row >= M or
+ * vec4 column >= K4 are zero-filled.
+ */
+void load_input_tile_with_checks(
+    out FPInputTile tile,
+    const int k4_start,
+    const int m_start,
+    const int K4,
+    const int M,
+    const int W_out) {
+  // W_out is read only by the texture3d path.
+  [[unroll]] for (int dm = 0; dm < TILE_M; ++dm) {
+    const int row = m_start + dm;
+    [[unroll]] for (int dk = 0; dk < TILE_K4; ++dk) {
+      const int col = k4_start + dk;
+      if (row >= M || col >= K4) {
+        tile.data[dm][dk] = LINEAR_FP_INPUT_TILE_VEC4_T(0.0);
+      } else {
+#if defined(INPUT_BUFFER)
+        // Buffer: element at linear index row * K4 + col, cast to the input
+        // tile type.
+        tile.data[dm][dk] = LINEAR_FP_INPUT_TILE_VEC4_T(t_in[row * K4 + col]);
+#elif defined(INPUT_TEXTURE3D)
+        // texture3d: the flat row index splits into x = row % W_out and
+        // y = row / W_out; the vec4 column is the Z coordinate.
+        tile.data[dm][dk] = LINEAR_FP_INPUT_TILE_VEC4_T(
+            texelFetch(t_in, ivec3(row % W_out, row / W_out, col), 0));
+#else
+        // texture2d: texel at (x = col, y = row).
+        tile.data[dm][dk] =
+            LINEAR_FP_INPUT_TILE_VEC4_T(texelFetch(t_in, ivec2(col, row), 0));
+#endif
+      }
+    }
+  }
+}
+
+void store_output_tile_with_checks(
+    const FPOutTile out_tile,
+    const int n4_start,
+    const int m_start,
+    const int N4,
+    const int M,
+    const int W_out) {
+  [[unroll]] for (int dm = 0; dm < TILE_M; ++dm) {
+    const int pos = m_start + dm;
+    [[unroll]] for (int dn = 0; dn < TILE_N4; ++dn) {
+      const int z = n4_start + dn;
+      if (pos < M && z < N4) {
+        // Convert the accumulator to the output texel type, then apply the
+        // activation chosen by the spec constant (1: max with 0, 2: clamp).
+        OUT_VEC4_T result = OUT_VEC4_T(out_tile.data[dm][dn]);
+        if (activation_type == 1) {
+          result = max(result, OUT_VEC4_T(0.0));
+        } else if (activation_type == 2) {
+          result = clamp(result, OUT_VEC4_T(OUT_MIN), OUT_VEC4_T(OUT_MAX));
+        }
+        imageStore(t_out, ivec3(pos % W_out, pos / W_out, z), result);
+      }
+    }
+  }
+}
+
+void main() {
+  // Each invocation owns one TILE_M x TILE_N4 output tile: the x invocation
+  // index selects the tile along N4, the y index the tile along M.
+  const int n4_start = int(gl_GlobalInvocationID.x) * TILE_N4;
+  const int m_start = int(gl_GlobalInvocationID.y) * TILE_M;
+
+  // out_sizes is read as (W_out, H_out, C_out, _); only W_out and C_out are
+  // needed here because M arrives precomputed in the push constants.
+  const int W_out = out_sizes.x;
+  const int N4 = div_up_4(out_sizes.z);
+  const int M = M_TOTAL;
+  const int K4 = K4_TOTAL;
+
+  if (n4_start >= N4 || m_start >= M) {
+    return;
+  }
+
+  FPOutTile out_tile;
+  initialize(out_tile);
+
+  FPInputTile in_tile;
+  FPWeightTile w_tile;
+
+  // Walk the K axis one TILE_K4 step at a time, accumulating into out_tile.
+  for (int k4 = 0; k4 < K4; k4 += TILE_K4) {
+    load_input_tile_with_checks(in_tile, k4, m_start, K4, M, W_out);
+    load_packed_weight_tile_with_checks(w_tile, n4_start, k4, 0, N4, K4);
+    fp_accumulate_with_fp_weight(out_tile, in_tile, w_tile);
+  }
+
+  // Apply bias. The bias texel depends only on n4, so fetch it once per n4 and
+  // add it to every m row rather than re-fetching inside the M loop.
+  [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+    if (n4_start + n4 < N4) {
+      // Cast the fetched bias texel to the accumulator type before adding.
+      const LINEAR_FP_OUTPUT_TILE_VEC4_T bias_texel =
+          LINEAR_FP_OUTPUT_TILE_VEC4_T(
+              texelFetch(t_bias, ivec2(n4_start + n4, 0), 0));
+      [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+        out_tile.data[m][n4] += bias_texel;
+      }
+    }
+  }
+
+  store_output_tile_with_checks(out_tile, n4_start, m_start, N4, M, W_out);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.yaml
new file mode 100644
index 00000000000..15ec490b130
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_gemm.yaml
@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv2d_gemm:
+  parameter_names_with_default_values:
+    DTYPE: float
+    IN_STORAGE: texture2d
+    TILE_M4: 1
+    TILE_K4: 1
+    TILE_N4: 1
+    TILE_M: 4
+  generate_variant_forall:
+    combination:
+      parameter_names: [IN_STORAGE, DTYPE]
+      combos:
+        - parameter_values: [texture2d, float]
+        - parameter_values: [texture2d, half]
+        - parameter_values: [texture3d, float]
+        - parameter_values: [texture3d, half]
+        - parameter_values: [buffer, float]
+        - parameter_values: [buffer, half]
+  shader_variants:
+    - NAME: conv2d_gemm
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.glsl
new file mode 100644
index 00000000000..20d07e3d1f8
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.glsl
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Im2col transformation for FP32 / FP16 conv2d.
+ *
+ * The output is a 2D matrix of shape [M, K_total] where
+ *   M       = H_out * W_out                 (number of output spatial positions)
+ *   K_total = Kh * Kw * align_up_4(C_in)    (flattened receptive field)
+ *
+ * K layout (so a 4-tile in K — one vec4 — holds the same kernel position):
+ *   K = (ki * Kw + kj) * Cin_padded + ci
+ *
+ * Three codegen'd storage variants of the output tensor:
+ *   - texture2d, width-packed: texel at (k4, m) holds 4 K values for spatial
+ *     position m.  Extents = (K_total/4, M).
+ *   - texture3d, channels-packed: texel at (ow, oh, k4) holds 4 K values
+ *     for output spatial position (oh, ow).  Extents = (W_out, H_out, K4).
+ *     Used as a fallback when M would exceed max_texture2d_dim.
+ *   - buffer: vec4 at offset (m * K4 + k4), same K packing.
+ *
+ * The caller picks storage per device (Mali → buffer; others → texture2d
+ * when its 2D extents fit, texture3d when its 3D extents fit, else buffer).
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+
+$if OUT_STORAGE == "buffer":
+  #define OUTPUT_BUFFER
+  #define VEC4_BUF_T ${texel_load_type(DTYPE, "buffer")}
+$elif OUT_STORAGE == "texture3d":
+  #define OUTPUT_TEXTURE3D
+
+${define_required_extensions("texture3d", DTYPE)}
+$if OUT_STORAGE == "buffer":
+  ${define_required_extensions("buffer", DTYPE)}
+
+layout(std430) buffer;
+
+$if OUT_STORAGE == "buffer":
+  ${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer", is_scalar_array=False)}
+$else:
+  ${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
+
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+
+// Push constants are uploaded in 16-byte chunks (one ivec4 each) to comply
+// with the per-entry size limit.
+layout(push_constant) uniform restrict Block {
+  ivec4 kernel_stride;  // (Kh, Kw, Sh, Sw)
+  ivec4 padding_dil;    // (Ph, Pw, Dh, Dw)
+  ivec4 dims;           // (Cin_padded, W_out, H_out, K4_total)
+};
+
+#define KERNEL_H   kernel_stride.x
+#define KERNEL_W   kernel_stride.y
+#define STRIDE_H   kernel_stride.z
+#define STRIDE_W   kernel_stride.w
+#define PADDING_H  padding_dil.x
+#define PADDING_W  padding_dil.y
+#define DILATION_H padding_dil.z
+#define DILATION_W padding_dil.w
+#define CIN_PADDED dims.x
+#define W_OUT      dims.y
+#define H_OUT      dims.z
+#define K4_TOTAL   dims.w
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  // x invocation index: vec4 column along K; y index: flat output position.
+  const int col4 = int(gl_GlobalInvocationID.x);
+  const int pos = int(gl_GlobalInvocationID.y);
+
+  if (col4 >= K4_TOTAL || pos >= H_OUT * W_OUT) {
+    return;
+  }
+
+  // Output position -> (oh, ow).
+  const int oh = pos / W_OUT;
+  const int ow = pos % W_OUT;
+
+  // With K = (ki * Kw + kj) * Cin_padded + ci and Cin_padded a multiple of 4,
+  // the four K values of this texel share one (ki, kj) and cover four
+  // consecutive ci starting at first_ci.
+  const int first_k = 4 * col4;
+  const int kernel_pos = first_k / CIN_PADDED; // ki * Kw + kj
+  const int first_ci = first_k % CIN_PADDED;
+  const int ki = kernel_pos / KERNEL_W;
+  const int kj = kernel_pos % KERNEL_W;
+
+  // Input coordinates sampled by kernel tap (ki, kj) at output (oh, ow).
+  const int ih = ki * DILATION_H + oh * STRIDE_H - PADDING_H;
+  const int iw = kj * DILATION_W + ow * STRIDE_W - PADDING_W;
+
+  // Taps that land outside the input extent contribute zeros.
+  const bool inside = ih >= 0 && iw >= 0 && ih < in_sizes.y && iw < in_sizes.x;
+  VEC4_T value = VEC4_T(0);
+  if (inside) {
+    value = texelFetch(t_in, ivec3(iw, ih, first_ci / 4), 0);
+  }
+
+#if defined(OUTPUT_BUFFER)
+  t_out[pos * K4_TOTAL + col4] = VEC4_BUF_T(value);
+#elif defined(OUTPUT_TEXTURE3D)
+  imageStore(t_out, ivec3(ow, oh, col4), value);
+#else
+  imageStore(t_out, ivec2(col4, pos), value);
+#endif
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.yaml
new file mode 100644
index 00000000000..918d79298dd
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_im2col.yaml
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv2d_im2col:
+  parameter_names_with_default_values:
+    DTYPE: float
+    OUT_STORAGE: texture2d
+  generate_variant_forall:
+    combination:
+      parameter_names: [OUT_STORAGE, DTYPE]
+      combos:
+        - parameter_values: [texture2d, float]
+        - parameter_values: [texture2d, half]
+        - parameter_values: [texture3d, float]
+        - parameter_values: [texture3d, half]
+        - parameter_values: [buffer, float]
+        - parameter_values: [buffer, half]
+  shader_variants:
+    - NAME: conv2d_im2col
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.glsl
new file mode 100644
index 00000000000..77f34324b4f
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.glsl
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define BUF_T ${buffer_scalar_type(BUF_DTYPE)}
+#define VEC4_T ${texel_load_type(DTYPE, PACKED_STORAGE)}
+#define T ${texel_load_component_type(DTYPE, PACKED_STORAGE)}
+
+$if PACKED_STORAGE == "buffer":
+  #define OUTPUT_BUFFER
+
+#extension GL_EXT_control_flow_attributes : require
+
+${define_required_extensions("buffer", BUF_DTYPE)}
+$if PACKED_STORAGE != "buffer":
+  ${define_required_extensions(PACKED_STORAGE, DTYPE)}
+
+layout(std430) buffer;
+
+#include "common.glslh"
+
+$if PACKED_STORAGE == "buffer":
+  ${layout_declare_tensor(B, "w", "t_weight_packed", DTYPE, "buffer", is_scalar_array=False)}
+$else:
+  ${layout_declare_tensor(B, "w", "t_weight_packed", DTYPE, PACKED_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_weight_src", BUF_DTYPE, "buffer", is_scalar_array=True)}
+
+// Push constants are uploaded in 16-byte chunks (one ivec4 each) to comply
+// with the per-entry size limit.
+layout(push_constant) uniform restrict Block {
+  ivec4 dims0; // (N=C_out, K=K_total, C_in, Cin_padded)
+  ivec4 dims1; // (K_h, K_w, _unused, _unused)
+};
+
+#define N          dims0.x
+#define K          dims0.y
+#define C_IN       dims0.z
+#define CIN_PADDED dims0.w
+#define K_H        dims1.x
+#define K_W        dims1.y
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Packs the ORIGINAL serialized conv2d weight [C_out, C_in, K_h, K_w]
+// (PyTorch row-major contiguous) directly into the 4OC x 4IC blocked layout
+// that conv2d_gemm.glsl loads via load_packed_weight_tile_with_checks, with no
+// CPU-side repack of the serialized data.
+//
+// The GEMM treats the weight as [N=C_out, K=K_total] with the im2col K-axis
+// layout
+//   k = (ki * K_w + kj) * Cin_padded + ci
+// so each 4-tile of K holds 4 consecutive ci for one (ki, kj). Lanes with
+// ci >= C_in are zero (Cin padding).
+//
+// This produces a byte-identical packed tensor to running the generic
+// pack_fp_linear_weight (is_transposed=1) over the CPU-flattened [C_out,
+// K_total] weight: a 4x4 block is transposed so packed[dk] = {w_flat[n4*4 +
+// 0..3][k4*4 + dk]}.
+
+// Returns element (n, k) of the logical [N, K] GEMM weight, read directly
+// from the serialized [C_out, C_in, K_h, K_w] source buffer. k decodes as
+// (ki * K_w + kj) * Cin_padded + ci. The result is 0 when n >= N, k >= K,
+// or ci >= C_in (a Cin padding lane).
+T load_flat_weight_scalar(const int n, const int k) {
+  const bool in_range = n < N && k < K;
+  // The range test runs first; the channel-lane test only runs when in range.
+  if (!in_range || (k % CIN_PADDED) >= C_IN) {
+    return T(0);
+  }
+  const int ci = k % CIN_PADDED;
+  const int tap = k / CIN_PADDED; // ki * K_w + kj
+  const int ki = tap / K_W;
+  const int kj = tap % K_W;
+  // Row-major offset into the contiguous [C_out, C_in, K_h, K_w] source.
+  const int offset = kj + K_W * (ki + K_H * (ci + C_IN * n));
+  return T(t_weight_src[offset]);
+}
+
+VEC4_T load_flat_weight_row(const int n, const int k_base) {
+  VEC4_T row; // filled with row n's 4 consecutive K entries from k_base
+  [[unroll]] for (int lane = 0; lane < 4; ++lane) {
+    row[lane] = load_flat_weight_scalar(n, k_base + lane);
+  }
+  return row;
+}
+
+void main() {
+  // One invocation per 4x4 block: x indexes blocks along N, y along K.
+  const int blk_n = int(gl_GlobalInvocationID.x);
+  const int blk_k = int(gl_GlobalInvocationID.y);
+
+  const int N4 = div_up_4(N);
+  if (blk_n >= N4 || blk_k >= div_up_4(K)) {
+    return;
+  }
+
+  const int n_base = blk_n * 4;
+  const int k_base = blk_k * 4;
+  // Gather the block as four rows (one per n), each holding four K values.
+  const VEC4_T r0 = load_flat_weight_row(n_base, k_base);
+  const VEC4_T r1 = load_flat_weight_row(n_base + 1, k_base);
+  const VEC4_T r2 = load_flat_weight_row(n_base + 2, k_base);
+  const VEC4_T r3 = load_flat_weight_row(n_base + 3, k_base);
+  // Emit the transpose: output vec4 `lane` holds K offset `lane` of all 4 rows.
+  [[unroll]] for (int lane = 0; lane < 4; ++lane) {
+    const VEC4_T block_col = VEC4_T(r0[lane], r1[lane], r2[lane], r3[lane]);
+#ifdef OUTPUT_BUFFER
+    t_weight_packed[(blk_k * N4 + blk_n) * 4 + lane] = block_col;
+#else
+    imageStore(t_weight_packed, ivec2(n_base + lane, blk_k), block_col);
+#endif
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.yaml
new file mode 100644
index 00000000000..42e0a8ab229
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/pack_conv2d_gemm_weight.yaml
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+pack_conv2d_gemm_weight:
+  parameter_names_with_default_values:
+    DTYPE: float
+    BUF_DTYPE: float
+    PACKED_STORAGE: texture2d
+  generate_variant_forall:
+    combination:
+      parameter_names: [PACKED_STORAGE, DTYPE, BUF_DTYPE]
+      combos:
+        - parameter_values: [texture2d, float, float]
+        - parameter_values: [texture2d, half, half]
+        - parameter_values: [texture2d, half, float]
+        - parameter_values: [buffer, float, float]
+        - parameter_values: [buffer, half, half]
+  shader_variants:
+    - NAME: pack_conv2d_gemm_weight
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.cpp
new file mode 100644
index 00000000000..352acbcfb50
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.cpp
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include <optional>
+
+namespace vkcompute {
+
+namespace {
+
+//
+// Weight handling
+//
+
+// Prepack the ORIGINAL serialized conv2d weight [C_out, C_in, K_h, K_w]
+// directly on the GPU into the 4OC x 4IC blocked layout that conv2d_gemm.glsl
+// loads via load_packed_weight_tile_with_checks. The serialized weight data is
+// read as-is (never CPU-repacked); pack_conv2d_gemm_weight.glsl performs the
+// im2col K-axis reorder (k = (ki * K_w + kj) * Cin_padded + ci, ci-padding
+// lanes zeroed) and the 4x4 transpose in one pass.
+//
+// The packed output is byte-identical to the layout the generic
+// prepack_fp_linear_weight (is_transposed=1) produced over a CPU-flattened
+// [C_out, K_total] weight, so conv2d_gemm.glsl is unchanged.
+ValueRef prepack_conv2d_gemm_weight(
+    ComputeGraph& graph,
+    const ValueRef weight_data) {
+  // Serialized weight dims: [C_out, C_in, K_h, K_w].
+  const std::vector<int64_t> w_sizes = graph.sizes_of(weight_data);
+  VK_CHECK_COND(w_sizes.size() == 4);
+  const int64_t N = w_sizes[0]; // C_out
+  const int64_t C_in = w_sizes[1];
+  const int64_t K_h = w_sizes[2];
+  const int64_t K_w = w_sizes[3];
+
+  // GEMM K axis: every kernel tap contributes Cin_padded entries.
+  const int64_t Cin_padded = utils::align_up_4(C_in);
+  const int64_t K = K_h * K_w * Cin_padded;
+
+  const int64_t N4 = utils::div_up(N, int64_t{4});
+  const int64_t K4 = utils::div_up(K, int64_t{4});
+
+  // The packed tensor has K4 rows; each row stores N4 blocks of 4 vec4s, i.e.
+  // N4 * 4 texels or N4 * 16 scalars under kWidthPacked.
+  const int64_t packed_rows = K4;
+  const int64_t packed_cols = N4 * 4 * 4;
+
+  // Use a 2D texture unless either texel extent exceeds the device limit.
+  const uint32_t tex2d_limit =
+      graph.context()->adapter_ptr()->max_texture2d_dim();
+  const bool too_large_for_tex2d = packed_cols / 4 > tex2d_limit ||
+      utils::safe_downcast<uint32_t>(packed_rows) > tex2d_limit;
+  const utils::StorageType packed_storage =
+      too_large_for_tex2d ? utils::kBuffer : utils::kTexture2D;
+
+  const ValueRef packed_weight = graph.add_tensor(
+      {packed_rows, packed_cols},
+      graph.dtype_of(weight_data),
+      packed_storage,
+      utils::kWidthPacked);
+
+  const utils::uvec3 global_wg_size = {
+      utils::safe_downcast<uint32_t>(N4),
+      utils::safe_downcast<uint32_t>(K4),
+      1u};
+
+  // Two 16-byte push-constant chunks whose field order must match dims0 /
+  // dims1 in pack_conv2d_gemm_weight.glsl.
+  const utils::ivec4 dims0{
+      utils::safe_downcast<int32_t>(N),
+      utils::safe_downcast<int32_t>(K),
+      utils::safe_downcast<int32_t>(C_in),
+      utils::safe_downcast<int32_t>(Cin_padded)};
+  const utils::ivec4 dims1{
+      utils::safe_downcast<int32_t>(K_h),
+      utils::safe_downcast<int32_t>(K_w),
+      0,
+      0};
+
+  // Variant name: base + storage suffix + tensor dtype + staging dtype.
+  std::string shader_name = "pack_conv2d_gemm_weight";
+  add_storage_type_suffix(shader_name, packed_storage);
+  add_dtype_suffix(shader_name, graph.dtype_of(weight_data));
+  add_dtype_suffix(shader_name, graph.get_staging_dtype_for(weight_data));
+
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(shader_name),
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
+      weight_data,
+      packed_weight,
+      {},
+      {},
+      {PushConstantDataInfo(&dims0, sizeof(dims0)),
+       PushConstantDataInfo(&dims1, sizeof(dims1))}));
+
+  return packed_weight;
+}
+
+//
+// GEMM dispatch
+//
+
+vkapi::ShaderInfo pick_conv2d_gemm_shader(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& /*resize_args*/) {
+  // Arg group 0 holds the output; the first ref of group 1 is the im2col
+  // tensor. Its storage type (texture2d, texture3d or buffer) picks the
+  // input-load variant, and the output dtype picks the dtype variant.
+  const ValueRef dst = args.at(0).refs.at(0);
+  const ValueRef im2col_src = args.at(1).refs.at(0);
+
+  std::string shader_name = "conv2d_gemm";
+  shader_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(shader_name, graph->storage_type_of(im2col_src));
+  add_dtype_suffix(shader_name, graph->dtype_of(dst));
+  return VK_KERNEL_FROM_STR(shader_name);
+}
+
+utils::uvec3 pick_conv2d_gemm_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& /*shader*/,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& /*resize_args*/) {
+  const ValueRef dst = args.at(0).refs.at(0);
+  const uint32_t width = graph->size_at<uint32_t>(-1, dst);
+  const uint32_t height = graph->size_at<uint32_t>(-2, dst);
+  const uint32_t channels = graph->size_at<uint32_t>(-3, dst);
+
+  // One invocation per output tile, assuming the shader's default tile shape
+  // (TILE_N4 = 1, TILE_M = 4): x spans channel texels, y spans H * W / 4.
+  const uint32_t tiles_n = utils::div_up_4(channels);
+  const uint32_t tiles_m = utils::div_up(height * width, 4u);
+  return {tiles_n, tiles_m, 1};
+}
+
+// Resize callback for the conv2d_gemm dispatch node. It intentionally does
+// nothing: the output tensor's sizes follow from the conv shape and are set
+// by the caller, so the GEMM step has no sizes of its own to recompute. The
+// function exists only because the dispatch node takes a resize callback.
+void resize_conv2d_gemm_node(
+    ComputeGraph* /*graph*/,
+    const std::vector<ArgGroup>& /*args*/,
+    const std::vector<ValueRef>& /*extra_args*/) {
+  // Intentionally empty: there is nothing to resize here.
+}
+
+void add_conv2d_gemm_node(
+    ComputeGraph& graph,
+    const ValueRef im2col_in,
+    const ValueRef packed_weight,
+    const ValueRef packed_bias,
+    const ValueRef out,
+    const int32_t K_total,
+    const int32_t M_total,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val) {
+  // Push-constant payloads, in the field order of the shader's Block.
+  const utils::ivec4 dims_pc{K_total, K_total / 4, M_total, 0};
+  const utils::vec4 clamp_pc{out_min_val, out_max_val, 0.0f, 0.0f};
+  // activation_type spec constant: 2 selects clamp, 0 selects none.
+  const int32_t activation = clamp_out ? 2 : 0;
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      pick_conv2d_gemm_shader,
+      pick_conv2d_gemm_global_wg_size,
+      pick_hw_square_wg_size,
+      // Tensor bindings: output first, then the three read-only inputs.
+      {{out, vkapi::kWrite},
+       {{im2col_in, packed_weight, packed_bias}, vkapi::kRead}},
+      // Uniform buffers
+      {graph.sizes_ubo(out)},
+      // Push constants (2 x 16 bytes)
+      {PushConstantDataInfo(&dims_pc, sizeof(dims_pc)),
+       PushConstantDataInfo(&clamp_pc, sizeof(clamp_pc))},
+      // Specialization constants
+      {activation},
+      // Resize args
+      {},
+      // Resize callback
+      resize_conv2d_gemm_node));
+}
+
+} // namespace
+
+//
+// Orchestration
+//
+
+void conv2d_gemm_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val,
+    const std::optional<utils::StorageType> im2col_storage_override) {
+  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
+  const std::vector<int64_t> w_sizes = graph.sizes_of(weight_data);
+  const std::vector<int64_t> out_sizes = graph.sizes_of(out);
+  VK_CHECK_COND(in_sizes.size() == 4 && in_sizes[0] == 1);
+  VK_CHECK_COND(w_sizes.size() == 4);
+  // out_sizes[2] and out_sizes[3] are indexed below, so require rank 4 first.
+  VK_CHECK_COND(out_sizes.size() == 4);
+
+  const int64_t C_in = w_sizes[1];
+  const int64_t K_h = w_sizes[2];
+  const int64_t K_w = w_sizes[3];
+  const int64_t H_out = out_sizes[2];
+  const int64_t W_out = out_sizes[3];
+
+  const int64_t Cin_padded = utils::align_up_4(C_in);
+  const int64_t K_total = K_h * K_w * Cin_padded;
+  // Cin_padded is a multiple of 4, so K_total / 4 below divides exactly.
+  VK_CHECK_COND(K_total % 4 == 0);
+
+  // Scope the IntListPtrs so none stay alive while the graph is mutated below.
+  int32_t stride_h, stride_w, padding_h, padding_w, dilation_h, dilation_w;
+  {
+    const auto stride_list = graph.get_int_list(stride);
+    const auto padding_list = graph.get_int_list(padding);
+    const auto dilation_list = graph.get_int_list(dilation);
+    stride_h = utils::safe_downcast<int32_t>(stride_list->at(0));
+    stride_w = utils::safe_downcast<int32_t>(stride_list->at(1));
+    padding_h = utils::safe_downcast<int32_t>(padding_list->at(0));
+    padding_w = utils::safe_downcast<int32_t>(padding_list->at(1));
+    dilation_h = utils::safe_downcast<int32_t>(dilation_list->at(0));
+    dilation_w = utils::safe_downcast<int32_t>(dilation_list->at(1));
+  }
+
+  const int64_t M = H_out * W_out;
+  const int64_t K4_total = K_total / 4;
+
+  // Pick im2col storage. When an explicit override is provided (test-only),
+  // honor it and skip auto-selection. Otherwise run the production
+  // auto-selection per device:
+  //   - Mali: always buffer (texture sampling on Mali is comparatively slow).
+  //   - Others: prefer texture2d (M × K4_total). If that doesn't fit the
+  //     device's max texture2d dim, fall back to texture3d laid out as
+  //     (W_out, H_out, K4_total). Buffer is the last-resort fallback.
+  utils::StorageType im2col_storage;
+  if (im2col_storage_override.has_value()) {
+    im2col_storage = im2col_storage_override.value();
+    VK_CHECK_COND(
+        im2col_storage == utils::kBuffer ||
+        im2col_storage == utils::kTexture2D ||
+        im2col_storage == utils::kTexture3D);
+  } else if (graph.device_is_mali()) {
+    im2col_storage = utils::kBuffer;
+  } else {
+    const uint32_t max_2d = graph.context()->adapter_ptr()->max_texture2d_dim();
+    const uint32_t max_3d = graph.context()->adapter_ptr()->max_texture3d_dim();
+    const bool fits_2d = utils::safe_downcast<uint32_t>(K4_total) <= max_2d &&
+        utils::safe_downcast<uint32_t>(M) <= max_2d;
+    const bool fits_3d = utils::safe_downcast<uint32_t>(W_out) <= max_3d &&
+        utils::safe_downcast<uint32_t>(H_out) <= max_3d &&
+        utils::safe_downcast<uint32_t>(K4_total) <= max_3d;
+    if (fits_2d) {
+      im2col_storage = utils::kTexture2D;
+    } else if (fits_3d) {
+      im2col_storage = utils::kTexture3D;
+    } else {
+      im2col_storage = utils::kBuffer;
+    }
+  }
+
+  // Allocate the im2col intermediate as a scoped scratch tensor. The im2col
+  // value is produced by the im2col node and consumed immediately by the GEMM
+  // node, both below, and is dead afterwards. Using a TmpTensor lets the memory
+  // planner alias one backing buffer across the (non-overlapping) im2col
+  // lifetimes of every conv2d layer, so peak memory tracks the largest single
+  // im2col rather than the sum of all of them. The TmpTensor must outlive
+  // add_conv2d_gemm_node (its last consumer), so it lives to the end of this
+  // function.
+  //
+  // The 2D and buffer variants use a flat [M, K_total] kWidthPacked shape; the
+  // texture3d variant uses the natural [1, K_total, H_out, W_out]
+  // kChannelsPacked shape so K4 lays along Z. Hoist the per-storage differences
+  // into locals so the TmpTensor is constructed exactly once and never needs to
+  // be copied or moved.
+  std::vector<int64_t> im2col_sizes;
+  utils::StorageType im2col_tmp_storage;
+  utils::GPUMemoryLayout im2col_layout;
+  if (im2col_storage == utils::kTexture3D) {
+    im2col_sizes = {1, K_total, H_out, W_out};
+    im2col_tmp_storage = utils::kTexture3D;
+    im2col_layout = utils::kChannelsPacked;
+  } else {
+    im2col_sizes = {M, K_total};
+    im2col_tmp_storage = im2col_storage;
+    im2col_layout = utils::kWidthPacked;
+  }
+  TmpTensor im2col_tmp(
+      &graph,
+      im2col_sizes,
+      graph.dtype_of(in),
+      im2col_tmp_storage,
+      im2col_layout);
+  const ValueRef im2col_tensor = im2col_tmp.vref;
+
+  // Step 1: im2col
+  add_conv2d_im2col_node(
+      graph,
+      in,
+      im2col_tensor,
+      utils::safe_downcast<int32_t>(K_h),
+      utils::safe_downcast<int32_t>(K_w),
+      stride_h,
+      stride_w,
+      padding_h,
+      padding_w,
+      dilation_h,
+      dilation_w,
+      utils::safe_downcast<int32_t>(Cin_padded),
+      utils::safe_downcast<int32_t>(H_out),
+      utils::safe_downcast<int32_t>(W_out));
+
+  // Step 2: prepack weight for the GEMM directly from the serialized
+  // [C_out, C_in, K_h, K_w] weight on the GPU. The serialized data is read
+  // as-is (never CPU-repacked); the prepack shader does the im2col K-axis
+  // reorder + 4x4 transpose into the layout conv2d_gemm.glsl loads via
+  // load_packed_weight_tile_with_checks.
+  ValueRef packed_weight = prepack_conv2d_gemm_weight(graph, weight_data);
+
+  // Bias prepack: matches the bias format conv2d_gemm expects. prepack_biases
+  // only reads dim 0 (= C_out) of the weight, so the original 4D weight works
+  // directly.
+  ValueRef packed_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      /*transposed=*/false,
+      utils::kTexture2D,
+      utils::kWidthPacked);
+
+  check_conv_args(graph, in, out);
+
+  // Step 3: GEMM
+  add_conv2d_gemm_node(
+      graph,
+      im2col_tensor,
+      packed_weight,
+      packed_bias,
+      out,
+      utils::safe_downcast<int32_t>(K_total),
+      utils::safe_downcast<int32_t>(M),
+      clamp_out,
+      out_min_val,
+      out_max_val);
+}
+
+//
+// Op registration — matches aten.convolution.default's 10-arg signature:
+//   in, weight, bias, stride, padding, dilation, transposed,
+//   output_padding, groups, out
+//
+// Only the conv2d non-transposed, groups=1 case is supported.
+
+void conv2d_gemm_op(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args follows aten.convolution.default's positional layout:
+  //   0 in, 1 weight, 2 bias, 3 stride, 4 padding, 5 dilation,
+  //   6 transposed, 7 output_padding, 8 groups, 9 out
+  VK_CHECK_COND(args.size() == 10);
+
+  // Only non-transposed, single-group convolutions are handled here.
+  const ValueRef transposed = args[6];
+  const ValueRef groups = args[8];
+  VK_CHECK_COND(graph.get_bool(transposed) == false);
+  VK_CHECK_COND(graph.get_int(groups) == 1);
+
+  const ValueRef in = args[0];
+  const ValueRef out = args[9];
+  // output_padding (args[7]) is not consumed by this path. Clamping is
+  // disabled for the registered op, so the two bounds passed below are
+  // placeholders, and no im2col storage override is supplied.
+  conv2d_gemm_impl(
+      graph,
+      in,
+      /*weight_data=*/args[1],
+      /*bias=*/args[2],
+      /*stride=*/args[3],
+      /*padding=*/args[4],
+      /*dilation=*/args[5],
+      out,
+      /*clamp_out=*/false,
+      /*out_min_val=*/0.0f,
+      /*out_max_val=*/0.0f);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(et_vk.conv2d_gemm.default, conv2d_gemm_op);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.h b/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.h
new file mode 100644
index 00000000000..73e95887266
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+#include <optional>
+
+namespace vkcompute {
+
+/*
+ * End-to-end orchestration for an FP32 / FP16 conv2d computed as
+ * im2col -> GEMM.  The dataflow is:
+ *
+ *   t_in [1, C_in, H_in, W_in]
+ *     │
+ *     ▼   (add_conv2d_im2col_node, conv2d_im2col.glsl)
+ *   im2col [1, K_total, H_out, W_out]      K_total = Kh * Kw * align_up_4(C_in)
+ *     │
+ *     │ + flattened weight [C_out, K_total]   (built CPU-side from
+ *     │   [C_out, C_in, Kh, Kw] with the ci ↔ (ki, kj) transpose)
+ *     ▼   (add_conv2d_gemm_node, conv2d_gemm.glsl)
+ *   t_out [1, C_out, H_out, W_out]
+ *
+ * This function performs both dispatch and prepack registration. The im2col
+ * intermediate is allocated as a graph tensor; the flattened weight is
+ * registered as a new TensorRef owned by the graph.
+ *
+ * Constraints:
+ *   - input batch == 1 and weight rank == 4 (asserted internally)
+ *   - groups == 1 (general grouped conv not yet supported) and
+ *     transposed == false. Neither is a parameter here: the registered op
+ *     asserts both before calling in; direct callers must uphold them.
+ *
+ * `im2col_storage_override` controls the storage type of the im2col
+ * intermediate tensor (and, by extension, the conv2d_gemm input-load variant):
+ *   - std::nullopt (default): the production path. Storage is auto-selected
+ *     from device characteristics and texture-extent limits — byte-for-byte
+ *     the same selection used by the registered op.
+ *   - a concrete StorageType: force that storage, skipping auto-selection.
+ *     Used by tests to exercise each storage variant deterministically and
+ *     independently of the device. Must be one of kBuffer / kTexture2D /
+ *     kTexture3D.
+ */
+void conv2d_gemm_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const bool clamp_out = false, // forwarded to add_conv2d_gemm_node
+    const float out_min_val = 0.0f, // forwarded alongside clamp_out
+    const float out_max_val = 0.0f, // forwarded alongside clamp_out
+    const std::optional<utils::StorageType> im2col_storage_override =
+        std::nullopt);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.cpp
new file mode 100644
index 00000000000..150275b5ac4
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+// Push constants are uploaded in 16-byte chunks (one ivec4 each) to comply
+// with the per-entry size limit. Layout matches conv2d_im2col.glsl:
+//   { ivec4 kernel_stride, ivec4 padding_dil, ivec4 dims }
+
+void add_conv2d_im2col_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef im2col_out,
+    const int32_t kernel_h,
+    const int32_t kernel_w,
+    const int32_t stride_h,
+    const int32_t stride_w,
+    const int32_t padding_h,
+    const int32_t padding_w,
+    const int32_t dilation_h,
+    const int32_t dilation_w,
+    const int32_t Cin_padded,
+    const int32_t H_out,
+    const int32_t W_out) {
+  const utils::StorageType out_storage = graph.storage_type_of(im2col_out);
+  VK_CHECK_COND(
+      out_storage == utils::kBuffer || out_storage == utils::kTexture2D ||
+      out_storage == utils::kTexture3D);
+
+  std::string shader_name = "conv2d_im2col";
+  add_storage_type_suffix(shader_name, out_storage);
+  add_dtype_suffix(shader_name, graph.dtype_of(im2col_out));
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(shader_name);
+
+  // Rows of the im2col matrix: one per output spatial position.
+  const int32_t num_rows = H_out * W_out;
+  // vec4 groups along K. Cin_padded is a multiple of 4 by contract, so
+  // K_total (kernel_h * kernel_w * Cin_padded) divides evenly by 4.
+  const int32_t num_k4 = (kernel_h * kernel_w * Cin_padded) / 4;
+
+  // One invocation per (k4, m) vec4 of the output.
+  const utils::uvec3 global_wg_size{
+      utils::safe_downcast<uint32_t>(num_k4),
+      utils::safe_downcast<uint32_t>(num_rows),
+      1u};
+  const utils::uvec3 local_wg_size{16u, 4u, 1u};
+
+  const utils::ivec4 kernel_stride{kernel_h, kernel_w, stride_h, stride_w};
+  const utils::ivec4 padding_dil{padding_h, padding_w, dilation_h, dilation_w};
+  const utils::ivec4 dims{Cin_padded, W_out, H_out, num_k4};
+
+  graph.execute_nodes().emplace_back(new DispatchNode(
+      graph,
+      shader,
+      global_wg_size,
+      local_wg_size,
+      // Inputs and Outputs
+      {{im2col_out, vkapi::kWrite}, {in, vkapi::kRead}},
+      // UBOs
+      {graph.sizes_ubo(in)},
+      // Push constants: kernel_stride, padding_dil, dims (16 bytes each)
+      {PushConstantDataInfo(&kernel_stride, sizeof(kernel_stride)),
+       PushConstantDataInfo(&padding_dil, sizeof(padding_dil)),
+       PushConstantDataInfo(&dims, sizeof(dims))},
+      // Specialization constants
+      {},
+      // Resize args
+      {},
+      // Resizing logic
+      nullptr));
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.h b/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.h
new file mode 100644
index 00000000000..8821db181bd
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dIm2Col.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+/*
+ * Dispatch a single im2col transformation node for an FP32 / FP16 conv2d.
+ *
+ * Produces a 2D tensor of logical shape
+ *   [M, K_total]
+ * where
+ *   M       = H_out * W_out
+ *   K_total = kernel_h * kernel_w * align_up_4(C_in)
+ *
+ * The K dimension is laid out so that consecutive 4-tiles of K hold 4
+ * consecutive ci values for the same (ki, kj) kernel position. This is the
+ * layout `conv2d_gemm` consumes for the GEMM step.
+ *
+ * The im2col output tensor's storage type (buffer, texture2d, or
+ * texture3d) is determined by the caller; this function picks the matching
+ * shader variant based on `graph.storage_type_of(im2col_out)`.
+ *
+ * Inputs:
+ *   in          : input texture3D channels-packed [1, C_in, H_in, W_in]
+ *   im2col_out  : im2col output, logically [M, K_total] (caller allocates),
+ *                 storage = buffer, texture2d (kWidthPacked), or texture3d
+ *   kernel_h/w  : conv kernel dimensions
+ *   stride_*    : conv strides
+ *   padding_*   : conv paddings
+ *   dilation_*  : conv dilations
+ *   Cin_padded  : align_up_4(C_in)
+ *   H_out, W_out: output spatial extents
+ */
+void add_conv2d_im2col_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef im2col_out,
+    const int32_t kernel_h,
+    const int32_t kernel_w,
+    const int32_t stride_h,
+    const int32_t stride_w,
+    const int32_t padding_h,
+    const int32_t padding_w,
+    const int32_t dilation_h,
+    const int32_t dilation_w,
+    const int32_t Cin_padded,
+    const int32_t H_out,
+    const int32_t W_out);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2d.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2d.cpp
new file mode 100644
index 00000000000..8949276740c
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestConv2d.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Conv2dGemm.h>
+
+#include <optional>
+
+namespace vkcompute {
+
+void test_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // Argument layout:
+  //   [0]    input  [N, C_in, H, W]
+  //   [1]    weight [C_out, C_in, K_h, K_w] (constant)
+  //   [2]    bias   (constant, or none)
+  //   [3-4]  stride_h, stride_w     (int)
+  //   [5-6]  padding_h, padding_w   (int)
+  //   [7-8]  dilation_h, dilation_w (int)
+  //   [9]    impl_selector (string)
+  //   [10]   output [N, C_out, H_out, W_out]
+  //
+  // impl_selector values:
+  //   ""              -> aten.convolution.default (direct sliding-window)
+  //   "im2col"        -> et_vk.conv2d_gemm.default, auto im2col storage
+  //   "im2col_buffer" -> im2col/GEMM, force buffer im2col intermediate
+  //   "im2col_tex2d"  -> im2col/GEMM, force texture2d im2col intermediate
+  //   "im2col_tex3d"  -> im2col/GEMM, force texture3d im2col intermediate
+  //   anything else   -> treated like "" (aten.convolution.default)
+  const ValueRef input = args.at(0);
+  const ValueRef weight = args.at(1);
+  const ValueRef bias = args.at(2);
+
+  // args[3..8] hold {stride, padding, dilation} as consecutive (h, w) pairs.
+  int64_t hw_params[6];
+  for (size_t i = 0; i < 6; ++i) {
+    hw_params[i] = graph.extract_scalar<int64_t>(args.at(3 + i));
+  }
+  const std::string impl_selector = graph.extract_string(args.at(9));
+  const ValueRef out = args.at(10);
+
+  // Re-pack each (h, w) pair as an int list value owned by the graph.
+  const ValueRef stride = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{hw_params[0], hw_params[1]});
+  const ValueRef padding = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{hw_params[2], hw_params[3]});
+  const ValueRef dilation = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{hw_params[4], hw_params[5]});
+
+  // Only the "im2col_<storage>" selectors pin the im2col storage type.
+  const auto forced_storage = [&]() -> std::optional<utils::StorageType> {
+    if (impl_selector == "im2col_buffer") {
+      return utils::kBuffer;
+    }
+    if (impl_selector == "im2col_tex2d") {
+      return utils::kTexture2D;
+    }
+    if (impl_selector == "im2col_tex3d") {
+      return utils::kTexture3D;
+    }
+    return std::nullopt;
+  }();
+
+  // The registered op (et_vk.conv2d_gemm.default) has no way to carry a
+  // storage override, so forced-storage runs call conv2d_gemm_impl directly.
+  if (forced_storage.has_value()) {
+    conv2d_gemm_impl(
+        graph,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        out,
+        /*clamp_out=*/false,
+        /*out_min_val=*/0.0f,
+        /*out_max_val=*/0.0f,
+        forced_storage);
+    return;
+  }
+
+  // Everything else goes through the operator registry using the 10-arg
+  // aten.convolution.default signature.
+  const ValueRef transposed = graph.add_scalar<bool>(false);
+  const ValueRef output_padding =
+      graph.add_scalar_list<int64_t>(std::vector<int64_t>{0, 0});
+  const ValueRef groups = graph.add_scalar<int64_t>(1);
+
+  // Direct conv is the default target; "im2col" switches to the GEMM op.
+  const char* target_op = "aten.convolution.default";
+  if (impl_selector == "im2col") {
+    target_op = "et_vk.conv2d_gemm.default";
+  }
+
+  const std::vector<ValueRef> op_args{
+      input, weight, bias, stride, padding,
+      dilation, transposed, output_padding, groups, out};
+
+  VK_GET_OP_FN(target_op)(graph, op_args);
+}
+
+REGISTER_OPERATORS { // Registers test_conv2d as test_etvk.test_conv2d.default.
+  VK_REGISTER_OP(test_etvk.test_conv2d.default, test_conv2d);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index f9501eeb424..7ff7b6ec426 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -101,6 +101,7 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_q8ta_pixel_shuffle")
     define_custom_op_test_binary("test_q8ta_unary")
     define_custom_op_test_binary("test_mm")
+    define_custom_op_test_binary("test_conv2d")
     define_custom_op_test_binary("test_conv2d_pw")
     define_custom_op_test_binary("test_conv2d_dw")
     define_custom_op_test_binary("test_embedding_q4gsw")
diff --git a/backends/vulkan/test/custom_ops/test_conv2d.cpp b/backends/vulkan/test/custom_ops/test_conv2d.cpp
new file mode 100644
index 00000000000..f56a2d81407
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_conv2d.cpp
@@ -0,0 +1,576 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <iostream>
+#include <vector>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include "conv2d_utils.h"
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 64;
+
+struct InputDims {
+  int64_t N; // batch
+  int64_t C; // input channels
+  int64_t H; // input height
+  int64_t W; // input width
+
+  InputDims(int64_t batch, int64_t channels, int64_t height, int64_t width)
+      : N{batch}, C{channels}, H{height}, W{width} {}
+};
+
+struct Conv2dTestConfig {
+  InputDims dims; // input tensor extents [N, C_in, H, W]
+  int64_t C_out; // number of output channels
+  KernelSize kernel; // kernel extents (h, w)
+  Stride stride; // stride (h, w)
+  Padding padding; // padding (h, w)
+  Dilation dilation; // dilation (h, w)
+  bool has_bias; // when false, a "none" bias spec is passed instead
+};
+
+static int64_t calc_out_size(
+    int64_t in_size,
+    int64_t kernel_size,
+    int64_t stride,
+    int64_t padding,
+    int64_t dilation) {
+  const int64_t footprint = dilation * (kernel_size - 1) + 1; // dilated kernel
+  return (in_size + 2 * padding - footprint) / stride + 1;
+}
+
+// Shared perf/skip classification used by both create_conv2d_test_case (to tag
+// PERF vs ACCU) and conv2d_reference_impl (to gate the large-K FP16 reference
+// check). A shape is "perf" if any dimension reaches kRefDimSizeLimit; the
+// boundary is inclusive (>=) so a 64-wide dim counts as perf — FP16
+// accumulation error at K = K_h * K_w * C_in for such shapes can exceed the
+// half tolerance and false-fail. Keep both call sites on this single helper to
+// avoid the two predicates drifting apart.
+static bool
+conv2d_is_perf_shape(int64_t C_in, int64_t C_out, int64_t H, int64_t W) {
+  return !(C_in < kRefDimSizeLimit && C_out < kRefDimSizeLimit &&
+           H < kRefDimSizeLimit && W < kRefDimSizeLimit);
+}
+
+static TestCase create_conv2d_test_case(
+    const Conv2dTestConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
+    utils::GPUMemoryLayout memory_layout,
+    const std::string& impl_selector = "") {
+  const InputDims& in_dims = config.dims;
+  // Output spatial extents implied by the conv hyper-parameters.
+  const int64_t H_out = calc_out_size(
+      in_dims.H,
+      config.kernel.h,
+      config.stride.h,
+      config.padding.h,
+      config.dilation.h);
+  const int64_t W_out = calc_out_size(
+      in_dims.W,
+      config.kernel.w,
+      config.stride.w,
+      config.padding.w,
+      config.dilation.w);
+
+  // Label pieces. PERF/ACCU comes from the shared shape classifier; only the
+  // h components of stride/padding/dilation are printed in the shape string.
+  const bool is_perf =
+      conv2d_is_perf_shape(in_dims.C, config.C_out, in_dims.H, in_dims.W);
+  const std::string prefix = is_perf ? "PERF" : "ACCU";
+  const std::string storage_str = repr_str(storage_type, memory_layout);
+  const std::string dtype_str = dtype_short(dtype);
+
+  std::string shape = "[" + std::to_string(in_dims.N);
+  for (const int64_t extent : {in_dims.C, in_dims.H, in_dims.W}) {
+    shape += "," + std::to_string(extent);
+  }
+  shape += "]->[" + std::to_string(config.C_out) + "]";
+  shape += " k" + std::to_string(config.kernel.h) + "x" +
+      std::to_string(config.kernel.w);
+  shape += " s" + std::to_string(config.stride.h);
+  shape += " p" + std::to_string(config.padding.h);
+  shape += " d" + std::to_string(config.dilation.h);
+
+  // Suffix is "+bias", "[<impl_selector>]", both (space-separated), or empty.
+  std::string suffix = config.has_bias ? "+bias" : "";
+  if (!impl_selector.empty()) {
+    if (!suffix.empty()) {
+      suffix += " ";
+    }
+    suffix += "[" + impl_selector + "]";
+  }
+
+  TestCase test_case;
+  test_case.set_name(make_test_label(
+      prefix, dtype_str, dtype_str, shape, storage_str, suffix));
+  test_case.set_operator_name("test_etvk.test_conv2d.default");
+
+  // Inputs, in the order test_conv2d reads them: activation [N, C_in, H, W].
+  ValueSpec input(
+      {in_dims.N, in_dims.C, in_dims.H, in_dims.W},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+  // Constant weight [C_out, C_in, K_h, K_w].
+  ValueSpec weight(
+      {config.C_out, in_dims.C, config.kernel.h, config.kernel.w},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+  weight.set_constant(true);
+
+  test_case.add_input_spec(input);
+  test_case.add_input_spec(weight);
+
+  if (!config.has_bias) {
+    // No bias: pass a placeholder spec flagged as none.
+    ValueSpec none_bias(static_cast<int32_t>(0));
+    none_bias.set_none(true);
+    test_case.add_input_spec(none_bias);
+  } else {
+    // Constant bias [C_out].
+    ValueSpec bias(
+        {config.C_out},
+        dtype,
+        storage_type,
+        memory_layout,
+        DataGenType::RANDOM);
+    bias.set_constant(true);
+    test_case.add_input_spec(bias);
+  }
+
+  // Six int32 scalars: stride h/w, padding h/w, dilation h/w.
+  const int32_t conv_params[] = {
+      static_cast<int32_t>(config.stride.h),
+      static_cast<int32_t>(config.stride.w),
+      static_cast<int32_t>(config.padding.h),
+      static_cast<int32_t>(config.padding.w),
+      static_cast<int32_t>(config.dilation.h),
+      static_cast<int32_t>(config.dilation.w)};
+  for (const int32_t param : conv_params) {
+    test_case.add_input_spec(ValueSpec(param));
+  }
+  test_case.add_input_spec(ValueSpec::make_string(impl_selector));
+
+  // Zero-initialized output [N, C_out, H_out, W_out].
+  ValueSpec output(
+      {in_dims.N, config.C_out, H_out, W_out},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::ZEROS);
+  test_case.add_output_spec(output);
+
+  // Half gets looser tolerances than float.
+  const bool is_half = (dtype == vkapi::kHalf);
+  test_case.set_abs_tolerance(is_half ? 1e-1f : 1e-3f);
+  test_case.set_rel_tolerance(is_half ? 1e-2f : 1e-3f);
+  test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"});
+
+  return test_case;
+}
+
+// Reference implementation for general conv2d (groups=1).
+//
+// Supports both FP32 and (small-shape) FP16 inputs. The math is always done in
+// float; for FP16 the master input/weight/bias values are dequantized from
+// their half storage via get_element(), and the resulting float reference is
+// compared against the dequantized GPU output by validate_against_reference().
+//
+// FP16 accumulation error grows with K (= K_h * K_w * C_in). For large-K PERF
+// shapes the FP32 reference would diverge from the GPU's FP16 accumulation
+// enough to trip even the relaxed half tolerance, producing false failures, so
+// those are intentionally left timing-only: this function throws
+// std::invalid_argument, which execute_test_cases() catches to skip the
+// correctness check (ref_computed stays false) while still benchmarking.
+static void conv2d_reference_impl(TestCase& test_case) {
+  const ValueSpec& input = test_case.inputs()[0];
+  const ValueSpec& weight = test_case.inputs()[1];
+  const ValueSpec& bias_spec = test_case.inputs()[2];
+  ValueSpec& output = test_case.outputs()[0];
+
+  const bool is_half = input.dtype == vkapi::kHalf;
+  if (!is_half && input.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Reference only supports float and half");
+  }
+
+  const auto input_sizes = input.get_tensor_sizes();
+  const auto weight_sizes = weight.get_tensor_sizes();
+  const auto output_sizes = output.get_tensor_sizes();
+
+  const int64_t N = input_sizes[0];
+  const int64_t C_in = input_sizes[1];
+  const int64_t H_in = input_sizes[2];
+  const int64_t W_in = input_sizes[3];
+  const int64_t C_out = weight_sizes[0];
+  const int64_t K_h = weight_sizes[2];
+  const int64_t K_w = weight_sizes[3];
+  const int64_t H_out = output_sizes[2];
+  const int64_t W_out = output_sizes[3];
+
+  // Half inputs are only reference-checked for non-PERF shapes; PERF half
+  // shapes bail out here (same classifier as create_conv2d_test_case).
+  if (is_half && conv2d_is_perf_shape(C_in, C_out, H_in, W_in)) {
+    throw std::invalid_argument(
+        "Half reference skipped for large-K PERF shape (timing-only)");
+  }
+
+  const int64_t stride_h = test_case.inputs()[3].get_int_value();
+  const int64_t stride_w = test_case.inputs()[4].get_int_value();
+  const int64_t padding_h = test_case.inputs()[5].get_int_value();
+  const int64_t padding_w = test_case.inputs()[6].get_int_value();
+  const int64_t dilation_h = test_case.inputs()[7].get_int_value();
+  const int64_t dilation_w = test_case.inputs()[8].get_int_value();
+
+  const bool has_bias = !bias_spec.is_none();
+  const int64_t in_plane = H_in * W_in;
+  const int64_t kernel_plane = K_h * K_w;
+  // Accumulation is in float for both dtypes; reads go through get_element().
+  auto& ref_data = output.get_ref_float_data();
+  ref_data.resize(N * C_out * H_out * W_out, 0.0f);
+  // Outputs are visited in NCHW row-major order, so a running index suffices.
+  int64_t out_idx = 0;
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t co = 0; co < C_out; ++co) {
+      for (int64_t oh = 0; oh < H_out; ++oh) {
+        for (int64_t ow = 0; ow < W_out; ++ow, ++out_idx) {
+          float acc = 0.0f;
+          for (int64_t ci = 0; ci < C_in; ++ci) {
+            const int64_t in_base = (n * C_in + ci) * in_plane;
+            const int64_t w_base = (co * C_in + ci) * kernel_plane;
+            for (int64_t kh = 0; kh < K_h; ++kh) {
+              const int64_t ih = oh * stride_h - padding_h + kh * dilation_h;
+              if (ih < 0 || ih >= H_in) {
+                continue; // whole kernel row falls in the padding
+              }
+              for (int64_t kw = 0; kw < K_w; ++kw) {
+                const int64_t iw = ow * stride_w - padding_w + kw * dilation_w;
+                if (iw < 0 || iw >= W_in) {
+                  continue; // tap falls in the padding
+                }
+                const float in_val =
+                    input.get_element(in_base + ih * W_in + iw);
+                const float w_val =
+                    weight.get_element(w_base + kh * K_w + kw);
+                acc += in_val * w_val;
+              }
+            }
+          }
+          if (has_bias) {
+            acc += bias_spec.get_element(co);
+          }
+          ref_data[out_idx] = acc;
+        }
+      }
+    }
+  }
+}
+
+static std::vector<TestCase> generate_conv2d_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  utils::GPUMemoryLayout layout = utils::kChannelsPacked;
+
+  // Accuracy shapes (every dim < kRefDimSizeLimit, so reference-checked)
+  std::vector<Conv2dTestConfig> accuracy_configs = {
+      // 3x3 stride=1 pad=1 same-channels (the bottleneck pattern in TinyCNN)
+      {InputDims(1, 8, 8, 8),
+       8,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 8, 8, 8),
+       8,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      {InputDims(1, 16, 16, 16),
+       16,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // 3x3 stride=2 (downsample) with channel expansion
+      {InputDims(1, 8, 16, 16),
+       16,
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // 3x3 stride=1 with channel reduction
+      {InputDims(1, 16, 8, 8),
+       8,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // Non-multiple-of-4 channels
+      {InputDims(1, 11, 8, 8),
+       13,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // 3-channel input (like RGB stem)
+      {InputDims(1, 3, 16, 16),
+       8,
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // TinyCNN depth estimator hotspots (from profiling
+  // UNTRAINED_TinyCNNDepthEstimatorRealTime_Vulkan.pte).
+  // Each entry lists (C_in, H, W) -> C_out, all 3x3 stride=1 pad=1 unless
+  // noted. Together the first 6 entries account for ~89% of all conv time.
+  std::vector<Conv2dTestConfig> perf_configs = {
+      // #1: 21.25% — (1,128,36,48)->(1,128,36,48)
+      {InputDims(1, 128, 36, 48),
+       128,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // #2: 20.68% — (1,256,18,24)->(1,256,18,24)
+      {InputDims(1, 256, 18, 24),
+       256,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // #3: 20.01% — (1,64,72,96)->(1,64,72,96)
+      {InputDims(1, 64, 72, 96),
+       64,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // #4: 13.25% — (1,32,144,192)->(1,32,144,192)
+      {InputDims(1, 32, 144, 192),
+       32,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // #5: 6.74% — (1,64,36,48)->(1,64,36,48)
+      {InputDims(1, 64, 36, 48),
+       64,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // #6: 5.90% — (1,32,72,96)->(1,32,72,96)
+      {InputDims(1, 32, 72, 96),
+       32,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // Secondary cases
+      // 3x3 stride=2 downsample with channel expansion: 1.52%
+      {InputDims(1, 32, 72, 96),
+       128,
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // 3x3 stride=1 same-shape, smaller spatial: 1.51%
+      {InputDims(1, 128, 18, 24),
+       128,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // 3x3 stride=1, channel reduction
+      {InputDims(1, 128, 18, 24),
+       64,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 64, 36, 48),
+       32,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // 3x3 stride=2 downsample, same channels
+      {InputDims(1, 32, 72, 96),
+       32,
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 64, 36, 48),
+       64,
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // RGB stem
+      {InputDims(1, 3, 144, 192),
+       32,
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // Small shapes used to exercise each im2col intermediate-storage variant
+  // (buffer / texture2d / texture3d) deterministically and independently of
+  // the device's auto-selection. All dims < kRefDimSizeLimit (the perf
+  // predicate is >=), so both dtypes get a reference check. For texture3d the
+  // im2col intermediate is the channels-packed [1, K_total, H_out, W_out] =
+  // [1, 144, 16, 16] for the 16x16 shape — tiny, so it always fits texture3d
+  // (which would never be naturally selected for a shape this small).
+  std::vector<Conv2dTestConfig> per_variant_configs = {
+      // 3x3 s1 p1, channels multiple of 4
+      {InputDims(1, 16, 16, 16),
+       16,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      // Non-multiple-of-4 channels exercise the Cin padding path
+      {InputDims(1, 11, 12, 12),
+       13,
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // Two implementation variants: direct sliding-window (default) and im2col.
+  const std::vector<std::string> impls = {"", "im2col"};
+  // Forced-storage im2col variants for the per-variant ACCU coverage.
+  const std::vector<std::string> forced_storage_impls = {
+      "im2col_buffer", "im2col_tex2d", "im2col_tex3d"};
+
+  // Generate accuracy test cases for both impls and both dtypes. FP16 small
+  // shapes get a real reference check (gated in conv2d_reference_impl); we run
+  // both dtypes so we catch correctness regressions in either path. Large-K
+  // half stays timing-only via the reference's PERF-shape throw.
+  const std::vector<vkapi::ScalarType> accu_dtypes = {
+      vkapi::kFloat, vkapi::kHalf};
+  for (const auto& config : accuracy_configs) {
+    for (auto st : storage_types) {
+      for (auto dtype : accu_dtypes) {
+        for (const auto& impl : impls) {
+          test_cases.push_back(
+              create_conv2d_test_case(config, dtype, st, layout, impl));
+        }
+      }
+    }
+  }
+
+  // Generate per-variant forced-storage ACCU cases (FP32 and FP16) so all
+  // three im2col intermediate-storage variants get deterministic,
+  // device-independent, reference-checked coverage at small K.
+  for (const auto& config : per_variant_configs) {
+    for (auto st : storage_types) {
+      for (auto dtype : accu_dtypes) {
+        for (const auto& impl : forced_storage_impls) {
+          test_cases.push_back(
+              create_conv2d_test_case(config, dtype, st, layout, impl));
+        }
+      }
+    }
+  }
+
+  // Generate performance test cases (float and half) for both impls.
+  for (const auto& config : perf_configs) {
+    std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+    for (auto dtype : dtypes) {
+      for (auto st : storage_types) {
+        for (const auto& impl : impls) {
+          test_cases.push_back(
+              create_conv2d_test_case(config, dtype, st, layout, impl));
+        }
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+static int64_t conv2d_flop_calculator(const TestCase& test_case) {
+  const auto in_sizes = test_case.inputs()[0].get_tensor_sizes();
+  const auto w_sizes = test_case.inputs()[1].get_tensor_sizes();
+  const auto out_sizes = test_case.outputs()[0].get_tensor_sizes();
+
+  // Outputs produced: N * C_out * H_out * W_out.
+  int64_t num_outputs = 1;
+  for (size_t d = 0; d < 4; ++d) {
+    num_outputs *= out_sizes[d];
+  }
+  // Each output takes C_in * K_h * K_w multiply-accumulates (2 FLOPs each).
+  const int64_t c_in = in_sizes[1];
+  const int64_t macs_per_output = c_in * w_sizes[2] * w_sizes[3];
+  return 2 * num_outputs * macs_per_output;
+}
+
+static void reference_impl(TestCase& test_case) {
+  conv2d_reference_impl(test_case); // thin adapter used as ref_fn in main()
+}
+
+int main(int argc, char* argv[]) {
+  // Harness switches: debugging/output/latency printing off, GPU timing on.
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+  print_performance_header();
+  std::cout << "General Conv2d (SlidingWindow, groups=1) Benchmark"
+            << std::endl;
+  print_separator();
+
+  constexpr int kWarmupRuns = 5;
+  constexpr int kBenchmarkRuns = 20;
+  ReferenceComputeFunc ref_fn = reference_impl;
+  execute_test_cases(
+      generate_conv2d_test_cases,
+      conv2d_flop_calculator,
+      "Conv2d",
+      kWarmupRuns,
+      kBenchmarkRuns,
+      ref_fn);
+
+  return 0;
+}