diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 8d1a85c96939b..3fa059b6c8040 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -14236,20 +14236,11 @@ size_t comp_size;
 size_t comp_nb[GGML_MAX_DIMS];
 size_t check_counter = 0;
 static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
-    ggml_tensor * tensor = cgraph->nodes[tensor_idx];
+    ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops];
     if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
         return;
     }
 
-    bool fused_rms_norm_mul = false;
-    int rms_norm_idx = -1;
-    if (ctx->num_additional_fused_ops == 1 &&
-        tensor->op == GGML_OP_RMS_NORM &&
-        cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) {
-        fused_rms_norm_mul = true;
-        tensor = cgraph->nodes[tensor_idx + 1];
-    }
-
     check_counter++;
     if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
         return;
@@ -14257,9 +14248,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
 
     VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
 
-    ggml_tensor * src0 = tensor->src[0];
-    ggml_tensor * src1 = tensor->src[1];
-
     struct ggml_init_params iparams = {
         /*.mem_size   =*/ 2ul*1024ul*1024ul*1024ul,
         /*.mem_buffer =*/ NULL,
@@ -14269,328 +14257,339 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
     struct ggml_context * ggml_ctx = ggml_init(iparams);
 
     std::array<ggml_tensor *, GGML_MAX_SRC> src_clone = {nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
-    std::array<size_t, GGML_MAX_SRC> src_size = {};
-    std::array<void *, GGML_MAX_SRC> src_buffer = {};
     const char * srci_name[GGML_MAX_SRC] = {"src0", "src1", "src2", "src3", "src4", "src5", "src6", "src7", "src8", "src9"};
+    std::map<ggml_tensor *, ggml_tensor *> cloned_tensors;
+    std::vector<void *> cloned_mallocs;
+
     struct ggml_tensor * tensor_clone = nullptr;
 
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        ggml_tensor * srci = tensor->src[i];
-        if (fused_rms_norm_mul) {
-            rms_norm_idx = tensor->src[0]->op == GGML_OP_RMS_NORM ? 0 : 1;
-            ggml_tensor *rms_norm = tensor->src[rms_norm_idx];
-            switch (i) {
-            case 0: srci = rms_norm->src[0]; break;
-            case 1: srci = tensor->src[1 - rms_norm_idx]; break;
-            default: continue;
+    for (int f = 0; f < ctx->num_additional_fused_ops + 1; ++f) {
+        tensor = cgraph->nodes[tensor_idx + f];
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            ggml_tensor * srci = tensor->src[i];
+            if (srci == nullptr) {
+                continue;
             }
-        }
-        if (srci == nullptr) {
-            continue;
-        }
-        ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci);
-        size_t srci_size = ggml_nbytes(srci);
-
-        src_clone[i] = srci_clone;
-        src_size[i] = ggml_nbytes(srci);
-        src_buffer[i] = malloc(srci_size);
-
-        srci_clone->data = src_buffer[i];
-        if (ggml_backend_buffer_is_host(srci->buffer)) {
-            memcpy(srci_clone->data, srci->data, srci_size);
-            memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (ggml_backend_buffer_is_vk(srci->buffer)) {
-            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context;
-            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
-            uint64_t offset = vk_tensor_offset(srci) + srci->view_offs;
-            if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) {
-                for (int i3 = 0; i3 < srci->ne[3]; i3++) {
-                    for (int i2 = 0; i2 < srci->ne[2]; i2++) {
-                        const int idx = i3*srci->ne[2] + i2;
-                        ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]);
+            // If a src tensor has been cloned, use that one
+            auto it = cloned_tensors.find(srci);
+            if (it != cloned_tensors.end()) {
+                src_clone[i] = it->second;
+                continue;
+            }
+            ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci);
+            size_t srci_size = ggml_nbytes(srci);
+
+            src_clone[i] = srci_clone;
+            void *src_buffer = malloc(srci_size);
+            cloned_mallocs.push_back(src_buffer);
+
+            srci_clone->data = src_buffer;
+            if (ggml_backend_buffer_is_host(srci->buffer)) {
+                memcpy(srci_clone->data, srci->data, srci_size);
+                memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
+            } else if (ggml_backend_buffer_is_vk(srci->buffer)) {
+                ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context;
+                vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+                uint64_t offset = vk_tensor_offset(srci) + srci->view_offs;
+                if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) {
+                    for (int i3 = 0; i3 < srci->ne[3]; i3++) {
+                        for (int i2 = 0; i2 < srci->ne[2]; i2++) {
+                            const int idx = i3*srci->ne[2] + i2;
+                            ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]);
+                        }
                     }
-                }
-                srci_clone->nb[0] = srci->nb[0];
-                srci_clone->nb[1] = srci->nb[1];
-                for (int i = 2; i < GGML_MAX_DIMS; i++) {
-                    srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1];
+                    srci_clone->nb[0] = srci->nb[0];
+                    srci_clone->nb[1] = srci->nb[1];
+                    for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                        srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1];
+                    }
+                } else {
+                    if (offset + srci_size >= buffer_gpu->size) {
+                        srci_size = buffer_gpu->size - offset;
+                    }
+                    ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size);
+                    memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
                 }
             } else {
-                if (offset + srci_size >= buffer_gpu->size) {
-                    srci_size = buffer_gpu->size - offset;
-                }
-                ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size);
-                memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
+                GGML_ABORT("fatal error");
GGML_ABORT("fatal error"); } - } else { - GGML_ABORT("fatal error"); - } - if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(srci, srci_name[i]); + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(srci, srci_name[i]); + } } - } - if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]); - if (src_clone[4]) { - ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); - } - } else if (tensor->op == GGML_OP_MUL_MAT) { - tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_MUL_MAT_ID) { - tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); - } else if (tensor->op == GGML_OP_SUB) { - tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_MUL) { - if (fused_rms_norm_mul) { - tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->src[rms_norm_idx]->op_params); - tensor_clone = ggml_mul(ggml_ctx, tensor_clone, src_clone[1 - rms_norm_idx]); - } else { + if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]); + if (src_clone[4]) { + ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); + } + } else if (tensor->op == GGML_OP_MUL_MAT) { + tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_MUL_MAT_ID) { + tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); + } else if (tensor->op == GGML_OP_SUB) { + tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_MUL) { tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); - } - } else if (tensor->op == GGML_OP_DIV) { - tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_CONCAT) { - tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params); - } else if (tensor->op == GGML_OP_UPSCALE) { - tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]); - } else if (tensor->op == GGML_OP_SCALE) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]); - } else if (tensor->op == GGML_OP_SQR) { - tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_SQRT) { - tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_SIN) { - tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_COS) { - tensor_clone = ggml_cos(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_CLAMP) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); - } else if (tensor->op == GGML_OP_PAD) { - tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3], - tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); - } else if (tensor->op == 
-        tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor);
-    } else if (tensor->op == GGML_OP_REPEAT_BACK) {
-        tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor);
-    } else if (tensor->op == GGML_OP_ADD) {
-        tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]);
-    } else if (tensor->op == GGML_OP_ACC) {
-        tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
-    } else if (tensor->op == GGML_OP_NORM) {
-        tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
-    } else if (tensor->op == GGML_OP_GROUP_NORM) {
-        const float * float_params = (const float *)tensor->op_params;
-        tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]);
-    } else if (tensor->op == GGML_OP_RMS_NORM) {
-        tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
-    } else if (tensor->op == GGML_OP_RMS_NORM_BACK) {
-        const float eps = ((float *) tensor->op_params)[0];
-        tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps);
-    } else if (tensor->op == GGML_OP_SILU_BACK) {
-        tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]);
-    } else if (tensor->op == GGML_OP_L2_NORM) {
-        const float eps = ((float *) tensor->op_params)[0];
-        tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps);
-    } else if (tensor->op == GGML_OP_SOFT_MAX) {
-        if (src1 != nullptr) {
+        } else if (tensor->op == GGML_OP_DIV) {
+            tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]);
+        } else if (tensor->op == GGML_OP_CONCAT) {
+            tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
+        } else if (tensor->op == GGML_OP_UPSCALE) {
+            tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
+        } else if (tensor->op == GGML_OP_SCALE) {
             const float * params = (const float *)tensor->op_params;
-            tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]);
-        } else {
-            tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]);
-        }
-    } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) {
-        tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
-    } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
-        tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]);
-    } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) {
-        const int n_dims = ((int32_t *) tensor->op_params)[1];
-        const int mode = ((int32_t *) tensor->op_params)[2];
-        //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
-        const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
-        const float freq_base = ((float *) tensor->op_params)[5];
-        const float freq_scale = ((float *) tensor->op_params)[6];
-        const float ext_factor = ((float *) tensor->op_params)[7];
-        const float attn_factor = ((float *) tensor->op_params)[8];
-        const float beta_fast = ((float *) tensor->op_params)[9];
-        const float beta_slow = ((float *) tensor->op_params)[10];
-        if (mode & GGML_ROPE_TYPE_MROPE) {
-            int32_t *sections = ((int32_t *) tensor->op_params) + 11;
-            if (tensor->op == GGML_OP_ROPE) {
-                tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+            tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
+        } else if (tensor->op == GGML_OP_SQR) {
+            tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_SQRT) {
+            tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_SIN) {
+            tensor_clone = ggml_sin(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_COS) {
+            tensor_clone = ggml_cos(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_CLAMP) {
+            const float * params = (const float *)tensor->op_params;
+            tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
+        } else if (tensor->op == GGML_OP_PAD) {
+            tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3],
+                                        tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]);
+        } else if (tensor->op == GGML_OP_REPEAT) {
+            tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor);
+        } else if (tensor->op == GGML_OP_REPEAT_BACK) {
+            tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor);
+        } else if (tensor->op == GGML_OP_ADD) {
+            tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]);
+        } else if (tensor->op == GGML_OP_ACC) {
+            tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
+        } else if (tensor->op == GGML_OP_NORM) {
+            tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
+        } else if (tensor->op == GGML_OP_GROUP_NORM) {
+            const float * float_params = (const float *)tensor->op_params;
+            tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]);
+        } else if (tensor->op == GGML_OP_RMS_NORM) {
+            tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
+        } else if (tensor->op == GGML_OP_RMS_NORM_BACK) {
+            const float eps = ((float *) tensor->op_params)[0];
+            tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps);
+        } else if (tensor->op == GGML_OP_SILU_BACK) {
+            tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]);
+        } else if (tensor->op == GGML_OP_L2_NORM) {
+            const float eps = ((float *) tensor->op_params)[0];
+            tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps);
+        } else if (tensor->op == GGML_OP_SOFT_MAX) {
+            if (tensor->src[1] != nullptr) {
+                const float * params = (const float *)tensor->op_params;
+                tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]);
             } else {
-                tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]);
             }
-        } else {
-            if (tensor->op == GGML_OP_ROPE) {
-                tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+        } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) {
+            tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
+        } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
+            tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]);
+        } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) {
+            const int n_dims = ((int32_t *) tensor->op_params)[1];
+            const int mode = ((int32_t *) tensor->op_params)[2];
+            //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
+            const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
+            const float freq_base = ((float *) tensor->op_params)[5];
+            const float freq_scale = ((float *) tensor->op_params)[6];
+            const float ext_factor = ((float *) tensor->op_params)[7];
+            const float attn_factor = ((float *) tensor->op_params)[8];
+            const float beta_fast = ((float *) tensor->op_params)[9];
+            const float beta_slow = ((float *) tensor->op_params)[10];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                int32_t *sections = ((int32_t *) tensor->op_params) + 11;
+                if (tensor->op == GGML_OP_ROPE) {
+                    tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                } else {
+                    tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                }
+            } else {
+                if (tensor->op == GGML_OP_ROPE) {
+                    tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                } else {
+                    tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                }
+            }
+        } else if (tensor->op == GGML_OP_UNARY) {
+            switch (ggml_get_unary_op(tensor)) {
+            case GGML_UNARY_OP_EXP:
+                tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_SILU:
+                tensor_clone = ggml_silu(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_GELU:
+                tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_GELU_ERF:
+                tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_GELU_QUICK:
+                tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_RELU:
+                tensor_clone = ggml_relu(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_TANH:
+                tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_SIGMOID:
+                tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_HARDSIGMOID:
+                tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]);
+                break;
+            case GGML_UNARY_OP_HARDSWISH:
+                tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]);
+                break;
+            default:
+                std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
+                GGML_ABORT("fatal error");
+            }
+        } else if (tensor->op == GGML_OP_GLU) {
+            if (src_clone[1] == nullptr) {
+                tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
+            } else {
+                tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]);
+            }
+            ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2));
+            ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3));
+        } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
+            if (tensor->src[1] == nullptr) {
+                tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
+                tensor_clone->type = tensor->type;
             } else {
-                tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
             }
+        } else if (tensor->op == GGML_OP_CONT) {
+            tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+        } else if (tensor->op == GGML_OP_RESHAPE) {
+            tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+        } else if (tensor->op == GGML_OP_VIEW) {
+            tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
+        } else if (tensor->op == GGML_OP_PERMUTE) {
+            int32_t * params = (int32_t *)tensor->op_params;
+            tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]);
+        } else if (tensor->op == GGML_OP_TRANSPOSE) {
+            tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_GET_ROWS) {
+            tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]);
+        } else if (tensor->op == GGML_OP_ARGSORT) {
+            tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params);
+        } else if (tensor->op == GGML_OP_SUM) {
+            tensor_clone = ggml_sum(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_SUM_ROWS) {
+            tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_MEAN) {
+            tensor_clone = ggml_mean(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_ARGMAX) {
+            tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]);
+        } else if (tensor->op == GGML_OP_COUNT_EQUAL) {
+            tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]);
+        } else if (tensor->op == GGML_OP_IM2COL) {
+            const int32_t s0 = tensor->op_params[0];
+            const int32_t s1 = tensor->op_params[1];
+            const int32_t p0 = tensor->op_params[2];
+            const int32_t p1 = tensor->op_params[3];
+            const int32_t d0 = tensor->op_params[4];
+            const int32_t d1 = tensor->op_params[5];
+
+            const bool is_2D = tensor->op_params[6] == 1;
+            tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
+        } else if (tensor->op == GGML_OP_IM2COL_3D) {
+            const int32_t s0 = tensor->op_params[0];
+            const int32_t s1 = tensor->op_params[1];
+            const int32_t s2 = tensor->op_params[2];
+            const int32_t p0 = tensor->op_params[3];
+            const int32_t p1 = tensor->op_params[4];
+            const int32_t p2 = tensor->op_params[5];
+            const int32_t d0 = tensor->op_params[6];
+            const int32_t d1 = tensor->op_params[7];
+            const int32_t d2 = tensor->op_params[8];
+            const int32_t IC = tensor->op_params[9];
+
+            tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
+        } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
+            const int32_t dim = tensor->op_params[0];
+            const int32_t max_period = tensor->op_params[1];
+            tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
+        } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
+            const int32_t s0 = tensor->op_params[0];
+            const int32_t p0 = tensor->op_params[1];
+            const int32_t d0 = tensor->op_params[2];
+            tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
+        } else if (tensor->op == GGML_OP_POOL_2D) {
+            enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
+            const int32_t k0 = tensor->op_params[1];
+            const int32_t k1 = tensor->op_params[2];
+            const int32_t s0 = tensor->op_params[3];
+            const int32_t s1 = tensor->op_params[4];
+            const int32_t p0 = tensor->op_params[5];
+            const int32_t p1 = tensor->op_params[6];
+
+            tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
+        } else if (tensor->op == GGML_OP_CONV_2D) {
+            const int32_t s0 = tensor->op_params[0];
+            const int32_t s1 = tensor->op_params[1];
+            const int32_t p0 = tensor->op_params[2];
+            const int32_t p1 = tensor->op_params[3];
+            const int32_t d0 = tensor->op_params[4];
+            const int32_t d1 = tensor->op_params[5];
+            tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
+        } else if (tensor->op == GGML_OP_CONV_2D_DW) {
+            const int32_t s0 = tensor->op_params[0];
+            const int32_t s1 = tensor->op_params[1];
+            const int32_t p0 = tensor->op_params[2];
+            const int32_t p1 = tensor->op_params[3];
+            const int32_t d0 = tensor->op_params[4];
+            const int32_t d1 = tensor->op_params[5];
+            tensor_clone = ggml_conv_2d_dw_direct(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
+        } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
+            const int32_t s = tensor->op_params[0];
+            tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
+        } else if (tensor->op == GGML_OP_LEAKY_RELU) {
+            const float * op_params = (const float *)tensor->op_params;
+            tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
+        } else if (tensor->op == GGML_OP_RWKV_WKV6) {
+            tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1],
+                src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
+        } else if (tensor->op == GGML_OP_RWKV_WKV7) {
+            tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3],
+                src_clone[4], src_clone[5], src_clone[6]);
+        } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
+            src_clone[0]->flags = tensor->src[0]->flags;
+            tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
+                src_clone[2], src_clone[3], src_clone[4]);
+        } else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
+            src_clone[0]->flags = tensor->src[0]->flags;
+            tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1],
+                src_clone[2]);
+        } else if (tensor->op == GGML_OP_ADD_ID) {
+            tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
+        } else if (tensor->op == GGML_OP_SSM_SCAN) {
+            tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2],
+                src_clone[3], src_clone[4], src_clone[5], src_clone[6]);
+        } else if (tensor->op == GGML_OP_SSM_CONV) {
+            tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]);
+        } else if (tensor->op == GGML_OP_ROLL) {
+            const int32_t s0 = tensor->op_params[0];
+            const int32_t s1 = tensor->op_params[1];
+            const int32_t s2 = tensor->op_params[2];
+            const int32_t s3 = tensor->op_params[3];
+            tensor_clone = ggml_roll(ggml_ctx, src_clone[0], s0, s1, s2, s3);
         }
-    } else if (tensor->op == GGML_OP_UNARY) {
-        switch (ggml_get_unary_op(tensor)) {
-        case GGML_UNARY_OP_EXP:
-            tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_SILU:
-            tensor_clone = ggml_silu(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_GELU:
-            tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_GELU_ERF:
-            tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_GELU_QUICK:
-            tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_RELU:
-            tensor_clone = ggml_relu(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_TANH:
-            tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_SIGMOID:
-            tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_HARDSIGMOID:
-            tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]);
-            break;
-        case GGML_UNARY_OP_HARDSWISH:
-            tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]);
-            break;
-        default:
+        else {
             std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
             GGML_ABORT("fatal error");
         }
-    } else if (tensor->op == GGML_OP_GLU) {
-        if (src_clone[1] == nullptr) {
-            tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
-        } else {
-            tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]);
-        }
-        ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2));
-        ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3));
-    } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
-        if (src1 == nullptr) {
-            tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
-            tensor_clone->type = tensor->type;
-        } else {
-            tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
-        }
-    } else if (tensor->op == GGML_OP_CONT) {
-        tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-    } else if (tensor->op == GGML_OP_RESHAPE) {
-        tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-    } else if (tensor->op == GGML_OP_VIEW) {
-        tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
-    } else if (tensor->op == GGML_OP_PERMUTE) {
-        int32_t * params = (int32_t *)tensor->op_params;
-        tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]);
-    } else if (tensor->op == GGML_OP_TRANSPOSE) {
-        tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]);
-    } else if (tensor->op == GGML_OP_GET_ROWS) {
-        tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]);
-    } else if (tensor->op == GGML_OP_ARGSORT) {
-        tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params);
-    } else if (tensor->op == GGML_OP_SUM) {
-        tensor_clone = ggml_sum(ggml_ctx, src_clone[0]);
-    } else if (tensor->op == GGML_OP_SUM_ROWS) {
-        tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]);
-    } else if (tensor->op == GGML_OP_MEAN) {
-        tensor_clone = ggml_mean(ggml_ctx, src_clone[0]);
-    } else if (tensor->op == GGML_OP_ARGMAX) {
-        tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]);
-    } else if (tensor->op == GGML_OP_COUNT_EQUAL) {
-        tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]);
-    } else if (tensor->op == GGML_OP_IM2COL) {
-        const int32_t s0 = tensor->op_params[0];
-        const int32_t s1 = tensor->op_params[1];
-        const int32_t p0 = tensor->op_params[2];
-        const int32_t p1 = tensor->op_params[3];
-        const int32_t d0 = tensor->op_params[4];
-        const int32_t d1 = tensor->op_params[5];
-
-        const bool is_2D = tensor->op_params[6] == 1;
-        tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
-    } else if (tensor->op == GGML_OP_IM2COL_3D) {
-        const int32_t s0 = tensor->op_params[0];
-        const int32_t s1 = tensor->op_params[1];
-        const int32_t s2 = tensor->op_params[2];
-        const int32_t p0 = tensor->op_params[3];
-        const int32_t p1 = tensor->op_params[4];
-        const int32_t p2 = tensor->op_params[5];
-        const int32_t d0 = tensor->op_params[6];
-        const int32_t d1 = tensor->op_params[7];
-        const int32_t d2 = tensor->op_params[8];
-        const int32_t IC = tensor->op_params[9];
-
-        tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
-    } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
-        const int32_t dim = tensor->op_params[0];
-        const int32_t max_period = tensor->op_params[1];
-        tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
-    } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
-        const int32_t s0 = tensor->op_params[0];
-        const int32_t p0 = tensor->op_params[1];
-        const int32_t d0 = tensor->op_params[2];
-        tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
-    } else if (tensor->op == GGML_OP_POOL_2D) {
-        enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
-        const int32_t k0 = tensor->op_params[1];
-        const int32_t k1 = tensor->op_params[2];
-        const int32_t s0 = tensor->op_params[3];
-        const int32_t s1 = tensor->op_params[4];
-        const int32_t p0 = tensor->op_params[5];
-        const int32_t p1 = tensor->op_params[6];
-
-        tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
-    } else if (tensor->op == GGML_OP_CONV_2D) {
-        const int32_t s0 = tensor->op_params[0];
-        const int32_t s1 = tensor->op_params[1];
-        const int32_t p0 = tensor->op_params[2];
-        const int32_t p1 = tensor->op_params[3];
-        const int32_t d0 = tensor->op_params[4];
-        const int32_t d1 = tensor->op_params[5];
-        tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
-    } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
-        const int32_t s = tensor->op_params[0];
-        tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
-    } else if (tensor->op == GGML_OP_LEAKY_RELU) {
-        const float * op_params = (const float *)tensor->op_params;
-        tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
-    } else if (tensor->op == GGML_OP_RWKV_WKV6) {
-        tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1],
-            src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
-    } else if (tensor->op == GGML_OP_RWKV_WKV7) {
-        tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3],
-            src_clone[4], src_clone[5], src_clone[6]);
-    } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
-        src_clone[0]->flags = src0->flags;
-        tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
-            src_clone[2], src_clone[3], src_clone[4]);
-    } else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
-        src_clone[0]->flags = src0->flags;
-        tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1],
-            src_clone[2]);
-    } else if (tensor->op == GGML_OP_ADD_ID) {
-        tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
-    } else if (tensor->op == GGML_OP_SSM_SCAN) {
-        tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2],
-            src_clone[3], src_clone[4], src_clone[5], src_clone[6]);
-    } else if (tensor->op == GGML_OP_SSM_CONV) {
-        tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]);
-    }
-    else {
-        std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
-        GGML_ABORT("fatal error");
+        cloned_tensors[tensor] = tensor_clone;
     }
 
     ggml_cgraph * cgraph_cpu = ggml_new_graph(ggml_ctx);
@@ -14608,10 +14607,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
     memcpy(comp_result, tensor_clone->data, comp_size);
     memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS);
 
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (src_buffer[i] != nullptr) {
-            free(src_buffer[i]);
-        }
+    for (auto m : cloned_mallocs) {
+        free(m);
     }
 
     ggml_free(ggml_ctx);
@@ -14620,15 +14617,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
 }
 
 static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
-    ggml_tensor * tensor = cgraph->nodes[tensor_idx];
+    ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops];
     if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
         return;
     }
-    if (ctx->num_additional_fused_ops == 1 &&
-        tensor->op == GGML_OP_RMS_NORM &&
-        cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) {
-        tensor = cgraph->nodes[tensor_idx + 1];
-    }
 
     if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
         return;
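
A note on the pattern for reviewers: the patch replaces the one-off RMS_NORM+MUL special case with a generic walk over all num_additional_fused_ops + 1 nodes of the fused chain, memoizing each node's CPU clone in cloned_tensors so that a later node consuming a fused intermediate is wired to the clone rather than to backend memory that was never written. Below is a minimal standalone sketch of just that memoization idea; Node and the two-node chain are hypothetical stand-ins, not the ggml API (the real implementation is ggml_vk_check_results_0 above), and the storage vector plays the role of cloned_mallocs: one flat list owning every allocation, released in a single pass.

// Sketch (hypothetical types, not the ggml API): a CPU-side mirror of a
// fused chain in which a consumer must read its producer's clone, because
// the fused intermediate never reaches backend memory.
#include <cstdio>
#include <map>
#include <memory>
#include <vector>

struct Node {
    const char * name;
    Node * src[2] = {nullptr, nullptr};
};

int main() {
    // Hypothetical fused chain: "mul" consumes "rms_norm", as in the old
    // special case that this patch generalizes.
    Node rms{"rms_norm"}, weight{"weight"}, mul{"mul"};
    mul.src[0] = &rms;
    mul.src[1] = &weight;
    const std::vector<Node *> chain = {&rms, &mul};

    std::map<Node *, Node *> cloned;             // original -> clone
    std::vector<std::unique_ptr<Node>> storage;  // owns every clone

    for (Node * n : chain) {
        Node * clone = storage.emplace_back(new Node{n->name}).get();
        for (int i = 0; i < 2; i++) {
            Node * s = n->src[i];
            if (s == nullptr) {
                continue;
            }
            // If the src was produced by an earlier node of the chain,
            // reuse that node's clone instead of cloning the original.
            auto it = cloned.find(s);
            clone->src[i] = it != cloned.end()
                                ? it->second
                                : storage.emplace_back(new Node{s->name}).get();
        }
        cloned[n] = clone;  // later nodes in the chain can find this clone
    }

    // Prints "mul consumes rms_norm": the clone graph mirrors the fusion.
    std::printf("%s consumes %s\n", cloned[&mul]->name, cloned[&mul]->src[0]->name);
}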