ggerganov · ikawrakow · Mar 26, 2024 · Mar 22, 2024 · Mar 22, 2024 · Mar 22, 2024
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
@@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
  { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
  { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
  { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
+ { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
  { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
  { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
  { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
@@ -320,10 +321,12 @@ int main(int argc, char ** argv) {
 
  if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
  params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
- params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
- fprintf(stderr, "\n===============================================================================================\n");
- fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
- fprintf(stderr, "===============================================================================================\n\n\n");
+ params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+ params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+ params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+ fprintf(stderr, "\n==========================================================================================================\n");
+ fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+ fprintf(stderr, "==========================================================================================================\n\n\n");
  return 1;
  }
 

diff --git a/ggml-common.h b/ggml-common.h
@@ -377,6 +377,14 @@ typedef struct {
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 
+// 1.8125 bpw
+typedef struct {
+ uint8_t qs[QK_K/8]; // grid index, low 8 bits
+ uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
+ uint8_t scales[QK_K/32]; // 4-bit block scales
+} block_iq1_m;
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+
 // Non-linear quants
 #define QK4_NL 32
 typedef struct {
@@ -1050,6 +1058,7 @@ GGML_TABLE_END()
 
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
 #if defined(GGML_COMMON_IMPL_C)
 GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
  0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -615,6 +615,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
  case GGML_TYPE_IQ2_S:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -643,6 +644,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
  case GGML_TYPE_IQ2_S:
  case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ4_NL:
  case GGML_TYPE_IQ4_XS:
  case GGML_TYPE_IQ3_S:
@@ -2560,7 +2562,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  ggml_type a_type = a->type;
  if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
  a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
- a_type == GGML_TYPE_IQ2_S  || a_type == GGML_TYPE_IQ4_XS) {
+ a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
  if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
  return false;
  }

diff --git a/ggml-cuda/convert.cu b/ggml-cuda/convert.cu
@@ -501,6 +501,42 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
 
 }
 
+typedef union {
+ half f16;
+ uint16_t u16;
+} iq1m_scale_t;
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+ const int i = blockIdx.x;
+ const block_iq1_m * x = (const block_iq1_m *) vx;
+
+ const int tid = threadIdx.x;
+#if QK_K == 256
+ const int il = tid/8; // 0...3
+ const int ib = tid%8; // 0...7
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+ iq1m_scale_t scale;
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+ const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
+ const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+ grid32[0] &= 0x0f0f0f0f;
+ for (int j = 0; j < 8; ++j) {
+ y[j] = d * (q[j] + delta);
+ }
+#else
+ assert(false);
+#endif
+
+}
+
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
@@ -658,6 +694,12 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
  dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template<typename dst_t>
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb = k / QK_K;
+ dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = (k + QK_K - 1) / QK_K;
@@ -724,6 +766,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
  return dequantize_row_iq3_xxs_cuda;
  case GGML_TYPE_IQ1_S:
  return dequantize_row_iq1_s_cuda;
+ case GGML_TYPE_IQ1_M:
+ return dequantize_row_iq1_m_cuda;
  case GGML_TYPE_IQ4_NL:
  return dequantize_row_iq4_nl_cuda;
  case GGML_TYPE_IQ4_XS:
@@ -769,6 +813,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  return dequantize_row_iq3_xxs_cuda;
  case GGML_TYPE_IQ1_S:
  return dequantize_row_iq1_s_cuda;
+ case GGML_TYPE_IQ1_M:
+ return dequantize_row_iq1_m_cuda;
  case GGML_TYPE_IQ4_NL:
  return dequantize_row_iq4_nl_cuda;
  case GGML_TYPE_IQ4_XS:

diff --git a/ggml-cuda/mmvq.cu b/ggml-cuda/mmvq.cu
@@ -282,6 +282,14 @@ static void mul_mat_vec_iq1_s_q8_1_cuda(
  (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }
 
+static void mul_mat_vec_iq1_m_q8_1_cuda(
+ const void * vx, const void * vy, float * dst,
+ const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+ mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
+ (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
 static void mul_mat_vec_iq4_nl_q8_1_cuda(
  const void * vx, const void * vy, float * dst,
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
@@ -373,6 +381,9 @@ void ggml_cuda_op_mul_mat_vec_q(
  case GGML_TYPE_IQ1_S:
  mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
  break;
+ case GGML_TYPE_IQ1_M:
+ mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+ break;
  case GGML_TYPE_IQ4_NL:
  mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
  break;

diff --git a/ggml-cuda/vecdotq.cuh b/ggml-cuda/vecdotq.cuh
@@ -1164,6 +1164,54 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
 #endif
 }
 
+static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if QK_K == 256
+ const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
+
+ const int ib32 = iqs;
+ int sumi[2] = {0, 0};
+ float sumf[2] = {0.f, 0.f};
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const int * q8 = (const int *)bq8_1[ib32].qs;
+ for (int l = 0; l < 4; ++l) {
+ const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
+ int grid0 = grid[0] & 0x0f0f0f0f;
+ int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
+ sumi[l/2] = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi[l/2]));
+ const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
+ const int sumy = __dp4a(q8[2*l+1], 0x01010101, __dp4a(q8[2*l+0], 0x01010101, 0));
+ sumf[l/2] += delta*sumy;
+ }
+#else
+ const int8_t * q8 = bq8_1[ib32].qs;
+ for (int l = 0; l < 4; ++l) {
+ const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+ int sumy = 0;
+ for (int j = 0; j < 4; ++j) {
+ sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
+ sumy += q8[j] + q8[j+4];
+ }
+ const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
+ sumf[l/2] += delta*sumy;
+ q8 += 8;
+ }
+#endif
+ typedef union {
+ half f16;
+ uint16_t u16;
+ } iq1m_scale_t;
+ iq1m_scale_t scale;
+ const uint16_t * sc = (const uint16_t *)bq1->scales;
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
+ return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
+#else
+ assert(false);
+ return 0.f;
+#endif
+}
+
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
  int & val1, int & val2) {