48 changes: 30 additions & 18 deletions onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
@@ -61,29 +61,17 @@ class QLinearConv : public OpKernel {
   static void ComputeOffset(OpKernelContext* context,
                             int64_t M,
                             ActType& X_zero_point_value,
-                            ActType& Y_zero_point_value,
-                            uint8_t& W_zero_point_value) {
+                            ActType& Y_zero_point_value) {
     const Tensor* X_zero_point = context->Input<Tensor>(InputTensors::IN_X_ZERO_POINT);
-    const Tensor* W_zero_point = context->Input<Tensor>(InputTensors::IN_W_ZERO_POINT);
     const Tensor* Y_zero_point = context->Input<Tensor>(InputTensors::IN_Y_ZERO_POINT);
     ORT_ENFORCE(IsScalarOr1ElementVector(X_zero_point),
                 "QLinearConv : input zero point must be a scalar or 1D tensor of size 1");
     ORT_ENFORCE(IsScalarOr1ElementVector(Y_zero_point),
                 "QLinearConv : result zero point must be a scalar or 1D tensor of size 1");
-    ORT_ENFORCE(IsValidQuantParam(W_zero_point, M),
-                "QLinearConv : filter zero point shape invalid");
 
     X_zero_point_value = *(X_zero_point->Data<ActType>());
     Y_zero_point_value = *(Y_zero_point->Data<ActType>());
 
-    const int64_t W_zero_point_size = W_zero_point->Shape().Size();
-    const auto* W_zero_point_data = static_cast<const uint8_t*>(W_zero_point->DataRaw());
-    W_zero_point_value = W_zero_point_data[0];
-    for (int64_t i = 1; i < W_zero_point_size; i++) {
-      ORT_ENFORCE(W_zero_point_data[i] == W_zero_point_value,
-                  "QLinearConv : zero point of per-channel filter must be same. "
-                  "This happens by design if the quantization is symmetric.");
-    }
+    ORT_UNUSED_PARAMETER(M);
   }
 
   static std::vector<float> ComputeOutputScale(OpKernelContext* context,
@@ -531,10 +519,27 @@ Status QLinearConv<ActType>::Compute(OpKernelContext* context) const {
 
   ActType X_zero_point_value;
   ActType Y_zero_point_value;
-  uint8_t W_zero_point_value;
-  ComputeOffset(context, M, X_zero_point_value, Y_zero_point_value, W_zero_point_value);
+  ComputeOffset(context, M, X_zero_point_value, Y_zero_point_value);
   std::vector<float> output_scales = ComputeOutputScale(context, M);
 
+  // Read weight zero points (may be scalar or per-channel).
+  const Tensor* W_zero_point = context->Input<Tensor>(InputTensors::IN_W_ZERO_POINT);
+  ORT_ENFORCE(IsValidQuantParam(W_zero_point, M), "QLinearConv : filter zero point shape invalid");
+  const int64_t W_zero_point_size = W_zero_point->Shape().Size();
+  const auto* W_zero_point_data = static_cast<const uint8_t*>(W_zero_point->DataRaw());
+  // Per-channel zero points are uniform when size == 1 or all values match.
+  const bool W_zero_point_is_uniform =
+      (W_zero_point_size <= 1) ||
+      std::all_of(W_zero_point_data + 1, W_zero_point_data + W_zero_point_size,
+                  [W_zero_point_data](uint8_t v) { return v == W_zero_point_data[0]; });
+  // When non-uniform, w_zero_point must be a full per-channel tensor of size M
+  // so that group_id * group_output_channels indexing is in bounds.
+  ORT_ENFORCE(W_zero_point_is_uniform || W_zero_point_size == M,
+              "QLinearConv : non-uniform weight zero point tensor size (", W_zero_point_size,
+              ") must equal number of output channels (", M, ")");
+  // Single representative value used for paths that require a scalar zero point.
+  const uint8_t W_zero_point_value = W_zero_point_data[0];
 
   const Tensor* B = context->Input<Tensor>(InputTensors::IN_BIAS);
 
   ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W_shape, channels_last_));
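
A minimal standalone sketch of the uniformity check added above, with the tensor buffer stood in by a `std::vector` (names here are illustrative, not ORT API): `std::all_of` compares every element after the first against element zero, so a scalar or size-1 tensor is trivially uniform.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for W_zero_point_data / W_zero_point_size above.
static bool IsUniform(const std::vector<uint8_t>& zp) {
  // Size 0/1 is trivially uniform; otherwise every element must equal zp[0].
  return zp.size() <= 1 ||
         std::all_of(zp.begin() + 1, zp.end(),
                     [&zp](uint8_t v) { return v == zp[0]; });
}

int main() {
  std::cout << IsUniform({128, 128, 128}) << "\n";  // 1: scalar fast paths remain eligible
  std::cout << IsUniform({100, 105, 110}) << "\n";  // 0: forces the per-column GEMM path
  return 0;
}
```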
@@ -610,7 +615,11 @@ Status QLinearConv<ActType>::Compute(OpKernelContext* context) const {
   int64_t group_output_channels = M / group_count;
 
   // Test for depthwise convolution.
-  const bool is_depthwise_conv = ((is_symmetric_conv_ || reordered_W != nullptr) && group_input_channels == 1 && group_output_channels == 1);
+  // Depthwise path requires a single (uniform) filter zero point because
+  // MlasConvDepthwise accepts only a scalar FilterZeroPoint.
+  const bool is_depthwise_conv = (W_zero_point_is_uniform &&
+                                  (is_symmetric_conv_ || reordered_W != nullptr) &&
+                                  group_input_channels == 1 && group_output_channels == 1);
   if (is_depthwise_conv) {
     // Update the input and output channels to the number of groups in order to
     // reuse as much of the below standard convolution path.
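
To make the new gating concrete, a toy sketch of the decision with local stand-ins for the kernel's state (not the ORT interface): with `groups == channels` each group sees one input and one output channel, and only a uniform zero-point tensor keeps the fast path on.

```cpp
#include <cstdint>
#include <iostream>

// Toy stand-in for the depthwise gating above; not the ORT kernel interface.
static bool IsDepthwiseEligible(bool zp_uniform, bool weights_packed,
                                int64_t group_input_channels,
                                int64_t group_output_channels) {
  return zp_uniform && weights_packed &&
         group_input_channels == 1 && group_output_channels == 1;
}

int main() {
  std::cout << IsDepthwiseEligible(true, true, 1, 1) << "\n";   // 1: depthwise fast path
  std::cout << IsDepthwiseEligible(false, true, 1, 1) << "\n";  // 0: group-GEMM fallback
  return 0;
}
```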
@@ -974,7 +983,10 @@ Status QLinearConv<ActType>::Compute(OpKernelContext* context) const {
           gemm_params.B = reordered_W + group_id * group_output_channels,
           gemm_params.ldb = static_cast<size_t>(M);
         }
-        gemm_params.ZeroPointB = &W_zero_point_value;
+        gemm_params.ZeroPointB = !W_zero_point_is_uniform
+                                     ? W_zero_point_data + group_id * group_output_channels
+                                     : &W_zero_point_value;
+        gemm_params.PerColumnZeroPoints = !W_zero_point_is_uniform;
         gemm_params.C = worker_gemm_output + group_id * group_output_channels;
         gemm_params.ldc = static_cast<size_t>(M);
 
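
For reference, `PerColumnZeroPoints` selects whether one zero point, or one per column of B, is subtracted inside the quantized GEMM; with per-channel weights each output channel is a column of B, which is why the pointer above advances by `group_id * group_output_channels`. A naive sketch of the arithmetic under those assumptions (a reference loop, not MLAS's implementation or signature):

```cpp
#include <cstddef>
#include <cstdint>

// Naive reference for a quantized GEMM with optional per-column B zero points:
//   C[m][n] = sum_k (A[m][k] - zp_a) * (B[k][n] - zp_b[n or 0])
// Shapes and names are illustrative; zp_b must hold N values when
// per_column_zp_b is true, and at least one value otherwise.
void QGemmRef(const uint8_t* A, const uint8_t* B, int32_t* C,
              size_t M, size_t N, size_t K,
              uint8_t zp_a, const uint8_t* zp_b, bool per_column_zp_b) {
  for (size_t m = 0; m < M; ++m) {
    for (size_t n = 0; n < N; ++n) {
      // Per-column mode reads a distinct zero point for each output column.
      const int32_t zb = per_column_zp_b ? zp_b[n] : zp_b[0];
      int32_t acc = 0;
      for (size_t k = 0; k < K; ++k) {
        acc += (static_cast<int32_t>(A[m * K + k]) - zp_a) *
               (static_cast<int32_t>(B[k * N + n]) - zb);
      }
      C[m * N + n] = acc;
    }
  }
}
```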
160 changes: 158 additions & 2 deletions onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
@@ -276,6 +276,7 @@ class QLinearConvOpTester {
     std::vector<int64_t> shape_;
     std::vector<float> scale_;
     T zero_point_{0};
+    std::vector<T> zero_points_;  // per-channel zero points (empty = use zero_point_)
   };
 
   std::default_random_engine generator_{1234};
@@ -406,7 +407,6 @@ class QLinearConvOpTester {
     const int64_t kernel_size = std::accumulate(
         kernel_shape, kernel_shape + kernel_rank, 1LL, std::multiplies<int64_t>());
     const int32_t X_zero_point = X_.zero_point_;
-    const int32_t W_zero_point = W_.zero_point_;
 
     const ActType* Xdata = X_.data_.data();
     ActType* Ydata = Y_data.data();
@@ -423,6 +423,10 @@ class QLinearConvOpTester {
         int32_t bias = B_.empty() ? 0 : B_[channel_index];
         float weight_scale = W_.scale_[(W_.scale_.size() == 1) ? 0 : channel_index];
         float requantize_scale = (X_.scale_[0] * weight_scale) / output_scale_;
+        // Use per-channel zero point if available, otherwise use the single zero_point_.
+        const int32_t W_zero_point = W_.zero_points_.empty()
+                                         ? static_cast<int32_t>(W_.zero_point_)
+                                         : static_cast<int32_t>(W_.zero_points_[channel_index]);
 
         std::vector<int64_t> d_output(kernel_rank, 0);
         std::vector<int64_t> d_kernel(kernel_rank, 0);
@@ -476,7 +480,12 @@ class QLinearConvOpTester {
     const std::vector<int64_t> W_scale_shape{static_cast<int64_t>(W_.scale_.size())};
     test.AddInput<FilterType>("w", W_.shape_, W_.data_, all_input_initializer_except_x);
     test.AddInput<float>("w_scale", W_scale_shape, W_.scale_, all_input_initializer_except_x);
-    test.AddInput<FilterType>("w_zero_point", {}, {W_.zero_point_}, all_input_initializer_except_x);
+    if (!W_.zero_points_.empty()) {
+      const std::vector<int64_t> W_zp_shape{static_cast<int64_t>(W_.zero_points_.size())};
+      test.AddInput<FilterType>("w_zero_point", W_zp_shape, W_.zero_points_, all_input_initializer_except_x);
+    } else {
+      test.AddInput<FilterType>("w_zero_point", {}, {W_.zero_point_}, all_input_initializer_except_x);
+    }
 
     test.AddInput<float>("y_scale", {}, {output_scale_}, all_input_initializer_except_x);
     test.AddInput<ActType>("y_zero_point", {}, {output_zero_point_}, all_input_initializer_except_x);
@@ -543,6 +552,10 @@ class QLinearConvOpTester {
     W_.scale_ = scales;
   }
 
+  void SetWeightZeroPoints(const std::vector<FilterType>& zero_points) {
+    W_.zero_points_ = zero_points;
+  }
+
   void GenerateRandomBias() {
     ORT_ENFORCE(W_.shape_.size() >= 1);
     const size_t output_channels = static_cast<size_t>(W_.shape_[0]);
@@ -1507,6 +1520,149 @@ TEST(QLinearConvTest, Conv2D_S8S8_Requantize_Bias_PerChannel) {
   }
 }
 
+// Tests per-channel weight zero points with different values (the fix for the reported bug).
+TEST(QLinearConvTest, Conv2D_U8U8_PerChannelZeroPoints) {
+  // TODO: Unskip when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: AbiCustomRegistry.cpp(507): The parameter is incorrect.";
+  }
+
+  for (int64_t channels : std::initializer_list<int64_t>{2, 4, 8, 16, 32}) {
+    QLinearConvOpTester<uint8_t, uint8_t> test;
+    test.GenerateRandomInput({1, 3, 9, 9}, .05f, 128);
+    test.GenerateRandomWeights({channels, 3, 3, 3}, .10f, 128);
+    std::vector<float> weight_scales;
+    std::vector<uint8_t> weight_zero_points;
+    for (int64_t i = 0; i < channels; i++) {
+      weight_scales.push_back(.10f + static_cast<float>(i) * .002f);
+      // Use different zero points per channel to exercise the per-channel path.
+      weight_zero_points.push_back(static_cast<uint8_t>(100 + i * 5));
+    }
+    test.SetWeightScales(weight_scales);
+    test.SetWeightZeroPoints(weight_zero_points);
+    test.GenerateRandomBias();
+    test.SetPads({1, 1, 1, 1});
+    test.SetOutputScaleAndZeroPoint(.55f, 128);
+    test.Run();
+  }
+}
+
+// Tests per-channel weight zero points with different values for int8 activations.
+TEST(QLinearConvTest, Conv2D_S8S8_PerChannelZeroPoints) {
+  // TODO: Unskip when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: AbiCustomRegistry.cpp(507): The parameter is incorrect.";
+  }
+
+  for (int64_t channels : std::initializer_list<int64_t>{2, 4, 8, 16, 32}) {
+    QLinearConvOpTester<int8_t, int8_t> test;
+    test.GenerateRandomInput({1, 4, 7, 7}, .05f, 4);
+    test.GenerateRandomWeights({channels, 4, 3, 3}, .10f, 0);
+    std::vector<float> weight_scales;
+    std::vector<int8_t> weight_zero_points;
+    for (int64_t i = 0; i < channels; i++) {
+      weight_scales.push_back(.10f + static_cast<float>(i) * .003f);
+      // Use different (non-zero) zero points per channel.
+      weight_zero_points.push_back(static_cast<int8_t>(-10 + i * 3));
+    }
+    test.SetWeightScales(weight_scales);
+    test.SetWeightZeroPoints(weight_zero_points);
+    test.GenerateRandomBias();
+    test.SetPads({1, 1, 1, 1});
+    test.SetOutputScaleAndZeroPoint(.55f, -8);
+    test.Run();
+  }
+}
+
+// Tests per-channel weight zero points for grouped convolution.
+TEST(QLinearConvTest, Conv2D_U8U8_Groups_PerChannelZeroPoints) {
+  // TODO: Unskip when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: AbiCustomRegistry.cpp(507): The parameter is incorrect.";
+  }
+
+  for (int64_t channels : std::initializer_list<int64_t>{4, 8, 16}) {
+    QLinearConvOpTester<uint8_t, uint8_t> test;
+    test.GenerateRandomInput({1, 8, 9, 9}, .05f, 128);
+    test.GenerateRandomWeights({channels, 4, 3, 3}, .10f, 128);
+    std::vector<float> weight_scales;
+    std::vector<uint8_t> weight_zero_points;
+    for (int64_t i = 0; i < channels; i++) {
+      weight_scales.push_back(.10f + static_cast<float>(i) * .002f);
+      weight_zero_points.push_back(static_cast<uint8_t>(80 + i * 7));
+    }
+    test.SetWeightScales(weight_scales);
+    test.SetWeightZeroPoints(weight_zero_points);
+    test.GenerateRandomBias();
+    test.SetPads({1, 1, 1, 1});
+    test.SetGroups(2);
+    test.SetOutputScaleAndZeroPoint(.55f, 128);
+    test.Run();
+  }
+}
+
+// Depthwise config (groups == channels) with non-uniform per-channel weight zero points.
+// The kernel cannot use MlasConvDepthwise with distinct ZPs, so this validates the
+// automatic fallback to the group-GEMM path.
+TEST(QLinearConvTest, Conv2D_S8S8_DepthwiseFallback_PerChannelZeroPoints) {
+  // TODO: Unskip when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: AbiCustomRegistry.cpp(507): The parameter is incorrect.";
+  }
+
+  constexpr int64_t channels = 8;
+
+  QLinearConvOpTester<int8_t, int8_t> test;
+  test.GenerateRandomInput({1, channels, 9, 9}, .05f, 4);
+  test.GenerateRandomWeights({channels, 1, 3, 3}, .10f, 0);
+
+  std::vector<float> weight_scales;
+  std::vector<int8_t> weight_zero_points;
+  for (int64_t i = 0; i < channels; ++i) {
+    weight_scales.push_back(.10f + static_cast<float>(i) * .003f);
+    weight_zero_points.push_back(static_cast<int8_t>(-10 + i * 3));
+  }
+
+  test.SetWeightScales(weight_scales);
+  test.SetWeightZeroPoints(weight_zero_points);
+  test.GenerateRandomBias();
+  test.SetPads({1, 1, 1, 1});
+  test.SetGroups(channels);
+  test.SetOutputScaleAndZeroPoint(.55f, -8);
+  test.Run();
+}
+
+// Depthwise config (groups == channels) with a full per-channel zero-point tensor whose
+// values are all identical. This should still use the depthwise fast path because the
+// zero points are uniform even though the tensor shape is per-channel.
+TEST(QLinearConvTest, Conv2D_S8S8_Depthwise_PerChannelUniformZeroPoints) {
+  // TODO: Unskip when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: AbiCustomRegistry.cpp(507): The parameter is incorrect.";
+  }
+
+  constexpr int64_t channels = 8;
+
+  QLinearConvOpTester<int8_t, int8_t> test;
+  test.GenerateRandomInput({1, channels, 9, 9}, .05f, 4);
+  test.GenerateRandomWeights({channels, 1, 3, 3}, .10f, 0);
+
+  std::vector<float> weight_scales;
+  std::vector<int8_t> weight_zero_points;
+  for (int64_t i = 0; i < channels; ++i) {
+    weight_scales.push_back(.10f + static_cast<float>(i) * .003f);
+    weight_zero_points.push_back(static_cast<int8_t>(-7));
+  }
+
+  test.SetWeightScales(weight_scales);
+  test.SetWeightZeroPoints(weight_zero_points);
+  test.GenerateRandomBias();
+  test.SetPads({1, 1, 1, 1});
+  test.SetGroups(channels);
+  test.SetOutputScaleAndZeroPoint(.55f, -8);
+  test.Run();
+}
+
 TEST(QLinearConvTest, Conv2D_S8S8_Depthwise_Kernelsize) {
   TestQLinearConv2dDepthwiseKernelsize<int8_t, int8_t>();
 }