Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sycl : try to fix SYCL after IQ1_S changes #5995

Merged
merged 3 commits into from Mar 12, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
66 changes: 34 additions & 32 deletions ggml-sycl.cpp
Expand Up @@ -3514,8 +3514,8 @@ static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N
#define QI1_S (QK_K / (4*QR1_S))
typedef struct {
sycl::half d;
uint8_t qs[QK_K/8];
uint8_t scales[QK_K/16];
uint8_t qs[QK_K/8];
uint16_t qh[QK_K/32];
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");

Expand Down Expand Up @@ -4891,10 +4891,9 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr
template<typename dst_t>
static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
const sycl::nd_item<3> &item_ct1,
const uint64_t *iq1s_grid,
const uint32_t *iq1s_grid,
const uint8_t *ksigns_iq2xs,
const uint8_t *kmask_iq2xs) {

const int i = item_ct1.get_group(2);
const block_iq1_s * x = (const block_iq1_s *) vx;

Expand All @@ -4903,11 +4902,15 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const int i8 = 4*ib+il;
uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
const float d = (float)x[i].d * (2*(h & 7) + 1);
for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
const uint8_t * qs = x[i].qs + 8*ib;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is wrong. There are no signs in IQ1_S and have never been. This bogus implementation has been sitting on the master branch for 2 weeks now. PR #6014 that actually fixes it, has been sitting unreviewed for 2 weeks.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This current implementation does not have any effect on 1qs on our backend, and we are taking a look at the proposed solutions to identify a proper fix. I would not be so confident to say that it fixes it as with the changes mentioned here , I was not able to get the correct runs when compared with nv .
I would also request to avoid using strong language on public PRs if the intention is to collaborate, and in terms of the code it was an older version of your implementation which actually did not give any results on our backend and we are investigating a proper solution.

const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
for (int j = 0; j < 4; ++j) {
abhilash1910 marked this conversation as resolved.
Show resolved Hide resolved
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
}
#else
assert(false);
#endif
Expand Down Expand Up @@ -7803,28 +7806,27 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
static __dpct_inline__ float
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
const uint64_t *iq1s_grid, const uint64_t *ksigns64) {
const uint32_t *iq1s_grid, const uint64_t *ksigns64) {
#if QK_K == 256
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

const int ib32 = iqs;
int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
const uint8_t h1 = bq1->scales[2*ib32+0];
const uint8_t h2 = bq1->scales[2*ib32+1];
const int * q8 = (const int *)bq8_1[ib32].qs;
const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
for (int j = 0; j < 2; ++j) {
sumi1 = dpct::dp4a(q8[j+0], grid1[j], sumi1);
sumi2 = dpct::dp4a(q8[j+2], grid2[j], sumi2);
sumi3 = dpct::dp4a(q8[j+4], grid3[j], sumi3);
sumi4 = dpct::dp4a(q8[j+6], grid4[j], sumi4);
}
const float d = (float)bq1->d * bq8_1[ib32].ds[0];
return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
const uint8_t * qs = bq1->qs + 4*ib32;
const int8_t * q8 = bq8_1[ib32].qs;
int sumi = 0;
for (int l = 0; l < 4; ++l) {
const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]);
const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8));
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
grid[0] ^ signs[0], signs[0], std::minus<>());
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
grid[1] ^ signs[1], signs[1], std::minus<>());
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
q8 += 8;
}
const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f;
return d * sumi;
#else
assert(false);
return 0.f;
Expand Down Expand Up @@ -8644,7 +8646,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
template <int qk, int qi, typename block_q_t, int vdr>
static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
const sycl::nd_item<3> &item_ct1,
const uint64_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);

Expand Down Expand Up @@ -10406,15 +10408,15 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
{
iq1s_grid.init(*stream);
iq1s_grid_gpu.init(*stream);
ksigns_iq2xs.init(*stream);
kmask_iq2xs.init(*stream);

dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});

stream->submit([&](sycl::handler &cgh) {
auto iq1s_grid_ptr_ct1 = iq1s_grid.get_ptr();
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();

Expand Down Expand Up @@ -11154,11 +11156,11 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{
iq1s_grid.init(*stream);
iq1s_grid_gpu.init(*stream);
ksigns64.init(*stream);

stream->submit([&](sycl::handler &cgh) {
auto iq1s_grid_ptr_ct1 = iq1s_grid.get_ptr();
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

cgh.parallel_for(
Expand Down