Skip to content

Commit

Permalink
metal : warp-based reduce for rms_norm
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Nov 30, 2023
1 parent 55717c9 commit c4db592
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 33 deletions.
18 changes: 11 additions & 7 deletions ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -1358,15 +1358,19 @@ void ggml_metal_graph_compute(
float eps;
memcpy(&eps, dst->op_params, sizeof(float));

const int nth = MIN(512, ne00);
int nth = 32; // SIMD width

while (nth < ne00/4 && nth < 1024) {
nth *= 2;
}

[encoder setComputePipelineState:ctx->pipeline_rms_norm];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

const int64_t nrows = ggml_nrows(src0);

Expand Down
41 changes: 15 additions & 26 deletions ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -447,14 +447,13 @@ kernel void kernel_rms_norm(
constant int64_t & ne00,
constant uint64_t & nb01,
constant float & eps,
threadgroup float * sum [[threadgroup(0)]],
threadgroup float * buf [[threadgroup(0)]],
uint tgpig[[threadgroup_position_in_grid]],
uint tpitg[[thread_position_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]],
uint tiisg[[thread_index_in_simdgroup]],
uint ntg[[threads_per_threadgroup]]) {
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
device const float * x_scalar = (device const float *) x;
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);

float4 sumf = 0;
float all_sum = 0;
Expand All @@ -465,40 +464,30 @@ kernel void kernel_rms_norm(
}
all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
all_sum = simd_sum(all_sum);
if (tiisg == 0) {
sum[sgitg] = all_sum;
}
if (ntg > N_SIMDWIDTH) {
if (sgitg == 0) {
buf[tiisg] = 0.0f;
}

threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup_barrier(mem_flags::mem_threadgroup);

// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
sum[tpitg] += sum[tpitg + i];
}
}
if (tpitg == 0) {
for (int i = 4 * (ne00 / 4); i < ne00; i++) {
sum[0] += x_scalar[i];
if (tiisg == 0) {
buf[sgitg] = all_sum;
}
sum[0] /= ne00;
}

threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup_barrier(mem_flags::mem_threadgroup);

const float mean = sum[0];
all_sum = buf[tiisg];
all_sum = simd_sum(all_sum);
}

const float mean = all_sum/ne00;
const float scale = 1.0f/sqrt(mean + eps);

device float4 * y = (device float4 *) (dst + tgpig*ne00);
device float * y_scalar = (device float *) y;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
y[i00] = x[i00] * scale;
}
if (tpitg == 0) {
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
y_scalar[i00] = x_scalar[i00] * scale;
}
}
}

// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
Expand Down

0 comments on commit c4db592

Please sign in to comment.