Commit
ggml-ci
Showing 61 changed files with 9,084 additions and 7 deletions.
New file, likely acc.cu (it includes acc.cuh):
@@ -0,0 +1,47 @@
#include "acc.cuh" | ||
|
||
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne, | ||
const int ne10, const int ne11, const int ne12, | ||
const int nb1, const int nb2, int offset) { | ||
const int i = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (i >= ne) { | ||
return; | ||
} | ||
int src1_idx = i - offset; | ||
int oz = src1_idx / nb2; | ||
int oy = (src1_idx - (oz * nb2)) / nb1; | ||
int ox = src1_idx % nb1; | ||
if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { | ||
dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; | ||
} else { | ||
dst[i] = x[i]; | ||
} | ||
} | ||
|
||
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements, | ||
const int ne10, const int ne11, const int ne12, | ||
const int nb1, const int nb2, const int offset, cudaStream_t stream) { | ||
int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; | ||
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset); | ||
} | ||
|
||
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { | ||
const ggml_tensor * src0 = dst->src[0]; | ||
const ggml_tensor * src1 = dst->src[1]; | ||
const float * src0_d = (const float *)src0->data; | ||
const float * src1_d = (const float *)src1->data; | ||
float * dst_d = (float *)dst->data; | ||
cudaStream_t stream = ctx.stream(); | ||
|
||
GGML_ASSERT(src0->type == GGML_TYPE_F32); | ||
GGML_ASSERT(src1->type == GGML_TYPE_F32); | ||
GGML_ASSERT( dst->type == GGML_TYPE_F32); | ||
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported | ||
|
||
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 | ||
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 | ||
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused | ||
int offset = dst->op_params[3] / 4; // offset in bytes | ||
|
||
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream); | ||
} |
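For context: acc copies src0 into dst and accumulates src1 into a strided view of it; nb1, nb2, and offset arrive in op_params as byte counts and are converted to float element counts before launch. A minimal CPU reference of the same index arithmetic, handy for checking the kernel (the function name is illustrative, not part of the commit):

// CPU reference for acc_f32: nb1/nb2/offset are in elements here,
// i.e. after the caller divides the byte-based op_params by 4.
static void acc_f32_ref(const float * x, const float * y, float * dst, const int ne,
                        const int ne10, const int ne11, const int ne12,
                        const int nb1, const int nb2, const int offset) {
    for (int i = 0; i < ne; ++i) {
        const int src1_idx = i - offset;
        const int oz = src1_idx / nb2;
        const int oy = (src1_idx - oz*nb2) / nb1;
        const int ox = src1_idx % nb1;
        if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
            dst[i] = x[i] + y[ox + oy*ne10 + oz*ne10*ne11]; // inside the src1 view: accumulate
        } else {
            dst[i] = x[i];                                  // outside: plain copy of src0
        }
    }
}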
New file, likely acc.cuh (the header included above):
@@ -0,0 +1,5 @@
#include "common.cuh" | ||
|
||
#define CUDA_ACC_BLOCK_SIZE 256 | ||
|
||
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); |
New file, likely alibi.cu (it includes alibi.cuh):
@@ -0,0 +1,63 @@
#include "alibi.cuh" | ||
|
||
static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows, | ||
const int n_heads_log2_floor, const float m0, const float m1) { | ||
const int col = blockDim.x*blockIdx.x + threadIdx.x; | ||
|
||
if (col >= ncols) { | ||
return; | ||
} | ||
|
||
const int row = blockDim.y*blockIdx.y + threadIdx.y; | ||
const int i = row*ncols + col; | ||
|
||
const int k = row/k_rows; | ||
|
||
float m_k; | ||
if (k < n_heads_log2_floor) { | ||
m_k = powf(m0, k + 1); | ||
} else { | ||
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); | ||
} | ||
|
||
dst[i] = col * m_k + x[i]; | ||
} | ||
|
||
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, | ||
const int k_rows, const int n_heads_log2_floor, const float m0, | ||
const float m1, cudaStream_t stream) { | ||
const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1); | ||
const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE); | ||
const dim3 block_nums(num_blocks_x, nrows, 1); | ||
alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1); | ||
} | ||
|
||
void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { | ||
const ggml_tensor * src0 = dst->src[0]; | ||
const float * src0_d = (const float *)src0->data; | ||
float * dst_d = (float *)dst->data; | ||
cudaStream_t stream = ctx.stream(); | ||
|
||
GGML_ASSERT(src0->type == GGML_TYPE_F32); | ||
GGML_ASSERT( dst->type == GGML_TYPE_F32); | ||
|
||
const int64_t ne00 = src0->ne[0]; | ||
const int64_t ne01 = src0->ne[1]; | ||
const int64_t ne02 = src0->ne[2]; | ||
const int64_t nrows = ggml_nrows(src0); | ||
|
||
//const int n_past = ((int32_t *) dst->op_params)[0]; | ||
const int n_head = ((int32_t *) dst->op_params)[1]; | ||
float max_bias; | ||
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); | ||
|
||
//GGML_ASSERT(ne01 + n_past == ne00); | ||
GGML_ASSERT(n_head == ne02); | ||
|
||
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); | ||
|
||
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); | ||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); | ||
|
||
alibi_f32_cuda(src0_d, dst_d, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, stream); | ||
} |
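The two bases implement the standard ALiBi slope recipe: with n2 = n_heads_log2_floor, head k gets slope m0^(k+1) = 2^(-(k+1)*max_bias/n2) when k < n2, and m1^(2(k-n2)+1) otherwise, which interleaves extra slopes when n_head is not a power of two. For example, with max_bias = 8 and n_head = 12 (so n2 = 8): m0 = 2^-1 gives heads 0..7 slopes 1/2, 1/4, ..., 1/256, and m1 = 2^-0.5 gives heads 8..11 slopes 2^-0.5, 2^-1.5, 2^-2.5, 2^-3.5. A host-side sketch of the same computation (standalone and illustrative only, not part of the commit):

#include <cmath>
#include <vector>

// Per-head ALiBi slopes, derived the same way the kernel computes m_k.
static std::vector<float> alibi_slopes(int n_head, float max_bias) {
    const int n2 = 1 << (int) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -max_bias / n2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n2);
    std::vector<float> slopes(n_head);
    for (int k = 0; k < n_head; ++k) {
        slopes[k] = k < n2 ? std::pow(m0, (float)(k + 1))                 // first n2 heads
                           : std::pow(m1, (float)(2*(k - n2) + 1));       // interleaved extras
    }
    return slopes;
}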
New file, likely alibi.cuh:
@@ -0,0 +1,5 @@
#include "common.cuh" | ||
|
||
#define CUDA_ALIBI_BLOCK_SIZE 32 | ||
|
||
void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst); |
New file, likely arange.cu (it includes arange.cuh):
@@ -0,0 +1,34 @@
#include "arange.cuh" | ||
|
||
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { | ||
// blockIDx.x: idx of ne0 / BLOCK_SIZE | ||
int nidx = threadIdx.x + blockIdx.x * blockDim.x; | ||
if (nidx >= ne0) { | ||
return; | ||
} | ||
dst[nidx] = start + step * nidx; | ||
} | ||
|
||
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { | ||
int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; | ||
arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step); | ||
} | ||
|
||
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { | ||
float * dst_d = (float *)dst->data; | ||
cudaStream_t stream = ctx.stream(); | ||
|
||
GGML_ASSERT(dst->type == GGML_TYPE_F32); | ||
|
||
float start; | ||
float stop; | ||
float step; | ||
memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); | ||
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); | ||
memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); | ||
|
||
int64_t steps = (int64_t)ceil((stop - start) / step); | ||
GGML_ASSERT(ggml_nelements(dst) == steps); | ||
|
||
arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); | ||
} |
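The steps assertion simply re-derives the element count the graph builder must have allocated: for example, start = 0, stop = 10, step = 3 gives ceil((10 - 0) / 3) = 4, matching the four values 0, 3, 6, 9 the kernel writes, so dst->ne[0] must equal 4.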
New file, likely arange.cuh:
@@ -0,0 +1,5 @@
#include "common.cuh" | ||
|
||
#define CUDA_ARANGE_BLOCK_SIZE 256 | ||
|
||
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst); |
New file, likely argsort.cu (it includes argsort.cuh):
@@ -0,0 +1,77 @@
#include "argsort.cuh" | ||
|
||
template<typename T> | ||
static inline __device__ void ggml_cuda_swap(T & a, T & b) { | ||
T tmp = a; | ||
a = b; | ||
b = tmp; | ||
} | ||
|
||
template<ggml_sort_order order> | ||
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) { | ||
// bitonic sort | ||
int col = threadIdx.x; | ||
int row = blockIdx.y; | ||
|
||
if (col >= ncols) return; | ||
|
||
const float * x_row = x + row * ncols; | ||
int * dst_row = dst + row * ncols; | ||
|
||
// initialize indices | ||
if (col < ncols) { | ||
dst_row[col] = col; | ||
} | ||
__syncthreads(); | ||
|
||
for (int k = 2; k <= ncols; k *= 2) { | ||
for (int j = k / 2; j > 0; j /= 2) { | ||
int ixj = col ^ j; | ||
if (ixj > col) { | ||
if ((col & k) == 0) { | ||
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { | ||
ggml_cuda_swap(dst_row[col], dst_row[ixj]); | ||
} | ||
} else { | ||
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { | ||
ggml_cuda_swap(dst_row[col], dst_row[ixj]); | ||
} | ||
} | ||
} | ||
__syncthreads(); | ||
} | ||
} | ||
} | ||
|
||
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { | ||
// bitonic sort requires ncols to be power of 2 | ||
GGML_ASSERT((ncols & (ncols - 1)) == 0); | ||
|
||
const dim3 block_dims(ncols, 1, 1); | ||
const dim3 block_nums(1, nrows, 1); | ||
if (order == GGML_SORT_ORDER_ASC) { | ||
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); | ||
} else if (order == GGML_SORT_ORDER_DESC) { | ||
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); | ||
} else { | ||
GGML_ASSERT(false); | ||
} | ||
} | ||
|
||
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { | ||
const ggml_tensor * src0 = dst->src[0]; | ||
const float * src0_d = (const float *)src0->data; | ||
float * dst_d = (float *)dst->data; | ||
cudaStream_t stream = ctx.stream(); | ||
|
||
GGML_ASSERT(src0->type == GGML_TYPE_F32); | ||
GGML_ASSERT( dst->type == GGML_TYPE_I32); | ||
GGML_ASSERT(ggml_is_contiguous(src0)); | ||
|
||
const int64_t ncols = src0->ne[0]; | ||
const int64_t nrows = ggml_nrows(src0); | ||
|
||
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; | ||
|
||
argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); | ||
} |
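The kernel launches one thread block per row with ncols threads, so each outer pass k merges bitonic subsequences of length k, (col & k) selects the sort direction for each subsequence, and __syncthreads() separates the compare-exchange substeps. This is why ncols must be a power of two (the assert above); it is also bounded by the maximum threads per block (1024 on current NVIDIA hardware). A serial CPU sketch of the same network (the function name is illustrative, not part of the commit):

#include <utility>
#include <vector>

// CPU reference for k_argsort_f32_i32 in ascending order: same
// compare-exchange network, with the GPU barriers implicit in the
// serial loops. x.size() must be a power of two.
static std::vector<int> bitonic_argsort(const std::vector<float> & x) {
    const int n = (int) x.size();
    std::vector<int> idx(n);
    for (int i = 0; i < n; ++i) idx[i] = i;
    for (int k = 2; k <= n; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            for (int col = 0; col < n; ++col) {
                const int ixj = col ^ j;
                if (ixj > col) {
                    const bool up = (col & k) == 0; // direction of this subsequence
                    if (up ? x[idx[col]] > x[idx[ixj]] : x[idx[col]] < x[idx[ixj]]) {
                        std::swap(idx[col], idx[ixj]);
                    }
                }
            }
        }
    }
    return idx; // e.g. x = {3, 1, 2, 0} yields idx = {3, 1, 2, 0}
}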
New file, likely argsort.cuh:
@@ -0,0 +1,3 @@
#include "common.cuh" | ||
|
||
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); |
(diff truncated; the remaining changed files are not shown)