Skip to content

Commit

Permalink
Merge branch 'master' into concedo_experimental
Browse files Browse the repository at this point in the history
  • Loading branch information
LostRuins committed Jan 3, 2024
2 parents 234f79f + 7bed7eb commit d37c94b
Show file tree
Hide file tree
Showing 8 changed files with 249 additions and 17 deletions.
9 changes: 9 additions & 0 deletions examples/server/public/completion.js
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,15 @@ export async function* llama(prompt, params = {}, config = {}) {
break;
}
}
if (result.error) {
result.error = JSON.parse(result.error);
if (result.error.content.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error: ${result.error.content}`);
}
}
if (result.error) {
result.error = JSON.parse(result.error);
console.error(`llama.cpp error: ${result.error.content}`);
Expand Down
10 changes: 7 additions & 3 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10042,14 +10042,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
}
return false;
} break;
case GGML_OP_DUP:
case GGML_OP_REPEAT:
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_REPEAT:
case GGML_OP_DUP:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
Expand All @@ -10066,7 +10071,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
case GGML_OP_CONCAT:
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
Expand Down
4 changes: 4 additions & 0 deletions ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
GGML_METAL_DECL_KERNEL(get_rows_i32);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(group_norm);
GGML_METAL_DECL_KERNEL(norm);
Expand Down Expand Up @@ -377,6 +378,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
GGML_METAL_ADD_KERNEL(get_rows_i32);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(group_norm);
GGML_METAL_ADD_KERNEL(norm);
Expand Down Expand Up @@ -499,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(get_rows_q4_K);
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
GGML_METAL_DEL_KERNEL(get_rows_i32);
GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(group_norm);
GGML_METAL_DEL_KERNEL(norm);
Expand Down Expand Up @@ -1978,6 +1981,7 @@ void ggml_metal_graph_compute(
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
default: GGML_ASSERT(false && "not implemented");
}

Expand Down
29 changes: 29 additions & 0 deletions ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -3829,6 +3829,35 @@ kernel void kernel_get_rows_f16(
}
}

kernel void kernel_get_rows_i32(
device const void * src0,
device const char * src1,
device int32_t * dst,
constant int64_t & ne00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb1,
constant uint64_t & nb2,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiitg[[thread_index_in_threadgroup]],
uint3 tptg [[threads_per_threadgroup]]) {
const int64_t i10 = tgpig.x;
const int64_t i11 = tgpig.y;

const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];

const int64_t i02 = i11;

for (int ind = tiitg; ind < ne00; ind += tptg.x) {
((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
}
}


#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
#define BLOCK_SIZE_K 32
Expand Down
166 changes: 162 additions & 4 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -4766,8 +4766,11 @@ struct ggml_tensor * ggml_get_rows(
}

// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
enum ggml_type type = GGML_TYPE_F32;
if (a->type == GGML_TYPE_I32) {
type = a->type;
}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);

result->op = GGML_OP_GET_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
Expand Down Expand Up @@ -6938,14 +6941,165 @@ static void ggml_compute_forward_dup_f32(
}
}

static void ggml_compute_forward_dup(
// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
static void ggml_compute_forward_dup_bytes(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type);

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}

if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, src0, dst);
return;
}

GGML_TENSOR_UNARY_OP_LOCALS;

const size_t type_size = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads


// parallelize by rows
const int nr = ne01;
// number of rows per thread
const int dr = (nr + nth - 1) / nth;
// row range for this thread
const int ir0 = dr * ith;
const int ir1 = MIN(ir0 + dr, nr);

if (src0->type == dst->type &&
ne00 == ne0 &&
nb00 == type_size && nb0 == type_size) {
// copy by rows
const size_t rs = ne00 * type_size;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ir0; i01 < ir1; i01++) {
memcpy(
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
rs);
}
}
}
return;
}

if (ggml_is_contiguous(dst)) {
size_t id = 0;
char * dst_ptr = (char *) dst->data;
const size_t rs = ne00 * type_size;

if (nb00 == type_size) {
// src0 is contigous on first dimension, copy by rows
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int64_t i01 = ir0; i01 < ir1; i01++) {
const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, rs);
id += rs;
}
id += rs * (ne01 - ir1);
}
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, type_size);

id += type_size;
}
}
id += rs * (ne01 - ir1);
}
}
}

return;
}

// dst counters

int64_t i10 = 0;
int64_t i11 = 0;
int64_t i12 = 0;
int64_t i13 = 0;

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);

memcpy(dst_ptr, src0_ptr, type_size);

if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
}

static void ggml_compute_forward_dup(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (src0->type == dst->type) {
ggml_compute_forward_dup_bytes(params, src0, dst);
return;
}

switch (src0->type) {
case GGML_TYPE_F16:
{
Expand Down Expand Up @@ -8404,10 +8558,12 @@ static void ggml_compute_forward_repeat(
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
case GGML_TYPE_I16:
{
ggml_compute_forward_repeat_f16(params, src0, dst);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_repeat_f32(params, src0, dst);
} break;
Expand Down Expand Up @@ -8550,6 +8706,7 @@ static void ggml_compute_forward_concat(
struct ggml_tensor* dst) {
switch (src0->type) {
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_concat_f32(params, src0, src1, dst);
} break;
Expand Down Expand Up @@ -10674,6 +10831,7 @@ static void ggml_compute_forward_get_rows(
ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
} break;
Expand Down
4 changes: 1 addition & 3 deletions scripts/sync-ggml-am.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ echo "Syncing ggml changes since commit $lc"
cd $SRC_GGML

git log --oneline $lc..HEAD
git log --oneline $lc..HEAD | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits
git log --oneline $lc..HEAD --reverse | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits

if [ ! -s $SRC_LLAMA/ggml-commits ]; then
rm -v $SRC_LLAMA/ggml-commits
Expand Down Expand Up @@ -87,7 +87,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-impl.h -> ggml-impl.h
# src/ggml-metal.h -> ggml-metal.h
# src/ggml-metal.m -> ggml-metal.m
# src/ggml-metal.metal -> ggml-metal.metal
# src/ggml-mpi.h -> ggml-mpi.h
# src/ggml-mpi.c -> ggml-mpi.c
# src/ggml-opencl.cpp -> ggml-opencl.cpp
Expand All @@ -114,7 +113,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
-e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
Expand Down
2 changes: 1 addition & 1 deletion scripts/sync-ggml.last
Original file line number Diff line number Diff line change
@@ -1 +1 @@
df098ea908764cba4a4889a1cbe7b026b2d31a14
3fd01e00e40583ccd4b393a7c6502d6a4455a1d5
Loading

0 comments on commit d37c94b

Please sign in to comment.