pytorch · meta-codesync · Jun 6, 2026 · Jun 5, 2026
@@ -11,6 +11,7 @@ non_fbcode_target(_kind = fb_apple_library,
     autoglob_mode = "EXPORT_UNLESS_INTERNAL",
     extension_api_only = True,
     frameworks = [
+        "CoreVideo",
         "Foundation",
     ],
     preprocessor_flags = [
@@ -29,11 +30,13 @@ non_fbcode_target(_kind = fb_apple_library,
     visibility = EXECUTORCH_CLIENTS,
     deps = select({
         "ovr_config//os:macos": [
+            "//xplat/executorch/extension/image:image_processorAppleMac",
             "//xplat/executorch/extension/module:moduleAppleMac",
             "//xplat/executorch/extension/tensor:tensorAppleMac",
             "//xplat/executorch/runtime/platform:platformAppleMac",
         ],
         "DEFAULT": [
+            "//xplat/executorch/extension/image:image_processorApple",
             "//xplat/executorch/extension/module:moduleApple",
             "//xplat/executorch/extension/tensor:tensorApple",
             "//xplat/executorch/runtime/platform:platformApple",

@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import CoreVideo
+
+public extension ImageNormalization {
+  /// Create a normalization with a custom scale factor and per-channel RGB mean
+  /// and standard deviation. `mean` and `standardDeviation` must each contain
+  /// exactly 3 elements (R, G, B), and every `standardDeviation` entry must be
+  /// nonzero; violating either traps. Applied per channel as
+  /// `(pixel * scaleFactor - mean[c]) / standardDeviation[c]`.
+  convenience init(scaleFactor: Float, mean: [Float], standardDeviation: [Float]) {
+    precondition(mean.count == 3, "mean must have exactly 3 elements (R, G, B)")
+    precondition(
+      standardDeviation.count == 3,
+      "standardDeviation must have exactly 3 elements (R, G, B)")
+    precondition(!standardDeviation.contains(0), "standardDeviation entries must be nonzero")
+    self.init(
+      __scaleFactor: scaleFactor, mean: mean.map { NSNumber(value: $0) },
+      standardDeviation: standardDeviation.map { NSNumber(value: $0) })
+  }
+}
+
+public extension ImageProcessorConfig {
+  /// `gpuMinInputPixels` sentinel (a source pixel count) that forces the GPU path.
+  static let alwaysGPU: Int = 0
+  /// `gpuMinInputPixels` sentinel (a source pixel count) that forces the CPU path.
+  static let alwaysCPU: Int = .max
+
+  /// Create an image processor config, specifying only the values that differ
+  /// from the defaults. `gpuMinInputPixels` is the minimum source pixel count
+  /// (width * height) at which the GPU path may be used; smaller inputs run on
+  /// the CPU. To force a path, pass `ImageProcessorConfig.alwaysGPU` or
+  /// `ImageProcessorConfig.alwaysCPU` (qualified: the parameter is a plain `Int`).
+  convenience init(
+    targetWidth: Int,
+    targetHeight: Int,
+    resizeMode: ImageResizeMode = .stretch,
+    letterboxAnchor: ImageLetterboxAnchor = .center,
+    padValue: Float = 0,
+    normalization: ImageNormalization = .zeroToOne(),
+    gpuMinInputPixels: Int = ImageProcessorConfig.defaultGpuMinInputPixels
+  ) {
+    self.init(
+      __targetWidth: targetWidth,
+      targetHeight: targetHeight,
+      resizeMode: resizeMode,
+      letterboxAnchor: letterboxAnchor,
+      padValue: padValue,
+      normalization: normalization,
+      gpuMinInputPixels: gpuMinInputPixels)
+  }
+}
+
+public extension ImageProcessor {
+  /// Convert a CVPixelBuffer into a newly allocated, normalized float tensor.
+  ///
+  /// The pixel format is detected from the buffer; BGRA, RGBA, 8-bit NV12, and
+  /// 10-bit P010 are supported. The result is a `Tensor<Float>` shaped
+  /// `[1, 3, target_height, target_width]`.
+  ///
+  /// No orientation correction is performed, and none can be inferred from a
+  /// CVPixelBuffer, so the caller must pass a buffer that is already upright.
+  ///
+  /// - Throws: The error reported by the underlying pixel-buffer processing.
+  func process(_ pixelBuffer: CVPixelBuffer) throws -> Tensor<Float> {
+    return try Tensor<Float>(processPixelBuffer(pixelBuffer))
+  }
+
+  /// Convert a CVPixelBuffer, writing the result into an existing tensor.
+  ///
+  /// Unlike `process(_:)` this performs no per-call output allocation, which
+  /// matters for sustained video. `tensor` must be a `Tensor<Float>` shaped
+  /// `[1, 3, target_height, target_width]`. Its storage is overwritten, so it
+  /// can be reused frame after frame; the contents stay valid only until the
+  /// next call that writes into the same tensor. As with `process(_:)`, the
+  /// buffer is treated as already upright.
+  func process(_ pixelBuffer: CVPixelBuffer, into tensor: Tensor<Float>) throws {
+    let destination = tensor.anyTensor
+    try processPixelBuffer(pixelBuffer, into: destination)
+  }
+
+  /// Per-side letterbox padding, in pixels, for a source of the given size.
+  /// `x` is the left/right pad and `y` the top/bottom pad of the resized
+  /// content; both are 0 for the stretch resize mode or the top-left anchor.
+  /// Callers can use this to map the padded output back to the source region.
+  func computeLetterboxPadding(inputWidth: Int, inputHeight: Int) -> (x: Int, y: Int) {
+    let pad = __computeLetterboxPadding(forInputWidth: inputWidth, height: inputHeight)
+    return (x: pad.x, y: pad.y)
+  }
+}
@@ -9,6 +9,7 @@
 #import "ExecuTorchBackendOption.h"
 #import "ExecuTorchBackendOptionsMap.h"
 #import "ExecuTorchError.h"
+#import "ExecuTorchImageProcessor.h"
 #import "ExecuTorchLog.h"
 #import "ExecuTorchModule.h"
 #import "ExecuTorchTensor.h"

@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#import <CoreVideo/CoreVideo.h>
+#import <Foundation/Foundation.h>
+
+#import "ExecuTorchTensor.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+typedef NS_ENUM(uint8_t, ExecuTorchImageResizeMode) {
+  ExecuTorchImageResizeModeStretch, ///< No letterbox padding is applied.
+  ExecuTorchImageResizeModeLetterbox, ///< Resized content is padded (letterboxed).
+} NS_SWIFT_NAME(ImageResizeMode);
+
+typedef NS_ENUM(uint8_t, ExecuTorchImageLetterboxAnchor) {
+  ExecuTorchImageLetterboxAnchorCenter, ///< Presumably centers the content; confirm.
+  ExecuTorchImageLetterboxAnchorTopLeft, ///< Reported letterbox padding is {0, 0}.
+} NS_SWIFT_NAME(ImageLetterboxAnchor);
+
+/// Per-side letterbox padding in pixels: `x` is the left/right pad and `y` the
+/// top/bottom pad of the resized content.
+typedef struct ExecuTorchImageLetterboxPadding {
+  NSInteger x; ///< Left/right pad, in pixels.
+  NSInteger y; ///< Top/bottom pad, in pixels.
+} ExecuTorchImageLetterboxPadding NS_SWIFT_NAME(ImageLetterboxPadding);
+
+NS_SWIFT_NAME(ImageNormalization)
+__attribute__((objc_subclassing_restricted))
+@interface ExecuTorchImageNormalization : NSObject
+
++ (instancetype)zeroToOne; ///< Preset; presumably maps pixels to [0, 1] (TODO confirm).
++ (instancetype)imagenet; ///< Preset; presumably ImageNet mean/std (TODO confirm).
+
+/// Create a normalization with a custom scale factor and per-channel RGB mean
+/// and standard deviation. `mean` and `standardDeviation` must each contain
+/// exactly 3 elements (R, G, B). Normalization is applied per channel as
+/// `(pixel * scaleFactor - mean[c]) / standardDeviation[c]`, so every
+/// `standardDeviation` entry must be nonzero.
+- (instancetype)initWithScaleFactor:(float)scaleFactor
+                               mean:(NSArray<NSNumber *> *)mean
+                  standardDeviation:(NSArray<NSNumber *> *)standardDeviation
+    NS_REFINED_FOR_SWIFT;
+
++ (instancetype)new NS_UNAVAILABLE; // Use a preset or the initializer above.
+- (instancetype)init NS_UNAVAILABLE; // Use a preset or the initializer above.
+
+@end
+
+NS_SWIFT_NAME(ImageProcessorConfig)
+__attribute__((objc_subclassing_restricted))
+@interface ExecuTorchImageProcessorConfig : NSObject
+
+@property(nonatomic, readonly) NSInteger targetWidth; ///< Output tensor width.
+@property(nonatomic, readonly) NSInteger targetHeight; ///< Output tensor height.
+@property(nonatomic, readonly) ExecuTorchImageResizeMode resizeMode;
+@property(nonatomic, readonly) ExecuTorchImageLetterboxAnchor letterboxAnchor;
+@property(nonatomic, readonly) float padValue; ///< Presumably the letterbox fill; confirm.
+@property(nonatomic, readonly) ExecuTorchImageNormalization *normalization;
+/// Minimum source pixel count (width * height) at which the GPU path may be
+/// used; smaller inputs run on the CPU. 0 forces GPU, NSIntegerMax forces CPU.
+@property(nonatomic, readonly) NSInteger gpuMinInputPixels;
+
+/// Default value for gpuMinInputPixels (mirrors the C++ config default).
+@property(class, nonatomic, readonly) NSInteger defaultGpuMinInputPixels;
+
+- (instancetype)initWithTargetWidth:(NSInteger)targetWidth
+                       targetHeight:(NSInteger)targetHeight
+                         resizeMode:(ExecuTorchImageResizeMode)resizeMode
+                    letterboxAnchor:(ExecuTorchImageLetterboxAnchor)letterboxAnchor
+                           padValue:(float)padValue
+                      normalization:(ExecuTorchImageNormalization *)normalization
+                  gpuMinInputPixels:(NSInteger)gpuMinInputPixels NS_REFINED_FOR_SWIFT;
+
++ (instancetype)new NS_UNAVAILABLE; // Use the designated initializer above.
+- (instancetype)init NS_UNAVAILABLE; // Use the designated initializer above.
+
+@end
+
+/// Converts CVPixelBuffers to normalized float tensors. NOT thread-safe per
+/// instance: internal scratch buffers are mutated during processing, so use one
+/// instance per concurrent caller. Different instances may run concurrently.
+NS_SWIFT_NAME(ImageProcessor)
+__attribute__((objc_subclassing_restricted))
+@interface ExecuTorchImageProcessor : NSObject
+
+@property(nonatomic, readonly) ExecuTorchImageProcessorConfig *config;
+
+- (instancetype)initWithConfig:(ExecuTorchImageProcessorConfig *)config;
+
+/// Process a CVPixelBuffer into a newly allocated, normalized float tensor.
+///
+/// The pixel format is auto-detected from the buffer's metadata. Supported
+/// formats: BGRA, RGBA, 8-bit NV12, and 10-bit P010 (P010 is narrowed to NV12
+/// internally). Other formats return an error.
+///
+/// The buffer is treated as already upright. Orientation correction is not
+/// applied and cannot be derived from a CVPixelBuffer, so the caller is
+/// responsible for supplying an upright buffer (e.g. by configuring the
+/// capture connection's orientation).
+///
+/// @param pixelBuffer The input pixel buffer.
+/// @param error On failure, set to an NSError describing what went wrong.
+/// @return A [1, 3, targetHeight, targetWidth] (CHW) ExecuTorchTensor, or nil on failure.
+- (nullable ExecuTorchTensor *)processPixelBuffer:(_Nullable CVPixelBufferRef)pixelBuffer
+                                            error:(NSError **)error;
+
+/// Process a CVPixelBuffer into a caller-provided tensor, reusing its storage.
+///
+/// Avoids the per-call output allocation of processPixelBuffer:error:, which
+/// matters for sustained video. `tensor` must be a Float tensor shaped
+/// [1, 3, targetHeight, targetWidth]; its storage is overwritten and can be
+/// reused across frames. The output lives in `tensor` itself, so the caller
+/// must finish using the previous frame's contents before the next call.
+///
+/// @param pixelBuffer The input pixel buffer, treated as already upright.
+/// @param tensor The output tensor to fill.
+/// @param error On failure, set to an NSError describing what went wrong.
+/// @return YES on success, NO on failure.
+- (BOOL)processPixelBuffer:(_Nullable CVPixelBufferRef)pixelBuffer
+                intoTensor:(ExecuTorchTensor *)tensor
+                     error:(NSError **)error;
+
+/// Letterbox padding (per side, in pixels) the processor applies for a source
+/// of the given size: `x` is the left/right pad and `y` the top/bottom pad of
+/// the resized content. Returns {0, 0} for the stretch resize mode or the
+/// top-left anchor. Lets callers map the padded output back to the source
+/// region without replicating the resize geometry.
+///
+/// @param inputWidth The source pixel width.
+/// @param inputHeight The source pixel height.
+/// @return The {x, y} per-side padding in pixels.
+- (ExecuTorchImageLetterboxPadding)computeLetterboxPaddingForInputWidth:(NSInteger)inputWidth
+                                                                height:(NSInteger)inputHeight
+    NS_REFINED_FOR_SWIFT;
+
++ (instancetype)new NS_UNAVAILABLE; // Use initWithConfig:.
+- (instancetype)init NS_UNAVAILABLE; // Use initWithConfig:.
+
+@end
+
+NS_ASSUME_NONNULL_END