Fixed gelu, added layernorm. Added timvx version gelu and layernorm #1415

Merged (4 commits) on May 18, 2023
221 changes: 221 additions & 0 deletions source/device/cpu/op/layernorm/layernorm_ref.c
@@ -0,0 +1,221 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (c) 2021, OPEN AI LAB
 * Author: Shijie Chen
 */

#include "layernorm_param.h"

#include "graph/tensor.h"
#include "graph/node.h"
#include "graph/graph.h"
#include "utility/sys_port.h"
#include "utility/float.h"
#include "utility/log.h"
#include "device/cpu/cpu_node.h"
#include "device/cpu/cpu_graph.h"
#include "device/cpu/cpu_module.h"

#include <math.h>

static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    return 0;
}

static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    return 0;
}

static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    return 0;
}

static int ref_layernorm_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
                              struct tensor* gamma_tensor, struct tensor* beta_tensor, float eps)
{
#if 1
    // TIM-VX
    int norm_size = input_tensor->dims[input_tensor->dim_num - 1];
    int count = 1;
    for (int i = 0; i < input_tensor->dim_num - 1; i++)
    {
        count *= input_tensor->dims[i];
    }
#else
    // PyTorch
    int norm_size = gamma_tensor->elem_num;
    int count = input_tensor->elem_num / gamma_tensor->elem_num;
#endif

    const float* input_data = (const float*)input_tensor->data;
    float* output_data = (float*)output_tensor->data;

    const float* gamma_data = (const float*)gamma_tensor->data;
    const float* beta_data = (const float*)beta_tensor->data;

    for (int i = 0; i < count; i++)
    {
        float sum = 0.f;
        float sqsum = 0.f;
        for (int j = 0; j < norm_size; j++)
        {
            float x = input_data[i * norm_size + j];
            sum += x;
            sqsum += x * x;
        }
        float mean = sum / norm_size;
        float var = sqsum / norm_size - mean * mean;
        float a = 1.0f / sqrtf(var + eps);
        float b = -mean * a;
        for (int j = 0; j < norm_size; j++)
        {
            int offset = i * norm_size + j;
            output_data[offset] = (input_data[offset] * a + b) * gamma_data[j] + beta_data[j];
        }
    }

    return 0;
}

static int ref_layernorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
                               struct tensor* gamma_tensor, struct tensor* beta_tensor, float eps)
{
#if 1
    // TIM-VX
    int norm_size = input_tensor->dims[input_tensor->dim_num - 1];
    int count = 1;
    for (int i = 0; i < input_tensor->dim_num - 1; i++)
    {
        count *= input_tensor->dims[i];
    }
#else
    // PyTorch
    int norm_size = gamma_tensor->elem_num;
    int count = input_tensor->elem_num / gamma_tensor->elem_num;
#endif

    int total_size = input_tensor->elem_num;
    float* input_data = (float*)sys_malloc(total_size * sizeof(float));
    float* output_data = (float*)sys_malloc(total_size * sizeof(float));

    // dequant
    {
        const uint8_t* input_uint8 = (const uint8_t*)input_tensor->data;
        float input_scale = input_tensor->scale;
        int input_zero = input_tensor->zero_point;

        for (int i = 0; i < total_size; i++)
            input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
    }

    const float* gamma_data = (const float*)gamma_tensor->data;
    const float* beta_data = (const float*)beta_tensor->data;

    for (int i = 0; i < count; i++)
    {
        float sum = 0.f;
        float sqsum = 0.f;
        for (int j = 0; j < norm_size; j++)
        {
            float x = input_data[i * norm_size + j];
            sum += x;
            sqsum += x * x;
        }
        float mean = sum / norm_size;
        float var = sqsum / norm_size - mean * mean;
        float a = 1.0f / sqrtf(var + eps);
        float b = -mean * a;
        for (int j = 0; j < norm_size; j++)
        {
            int offset = i * norm_size + j;
            output_data[offset] = (input_data[offset] * a + b) * gamma_data[j] + beta_data[j];
        }
    }

    // quant
    {
        uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
        float output_scale = output_tensor->scale;
        int output_zero = output_tensor->zero_point;
        for (int i = 0; i < total_size; i++)
        {
            int udata = (int)roundf(output_data[i] / output_scale + output_zero);
            if (udata > 255)
                udata = 255;
            else if (udata < 0)
                udata = 0;
            output_uint8[i] = udata;
        }
    }

    sys_free(input_data);
    sys_free(output_data);
    return 0;
}
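/* Note on the requantization step above (illustration, not PR code):
 * each float result maps to uint8 as q = round(x / output_scale + output_zero),
 * clamped to [0, 255]. With assumed values scale = 0.05, zero_point = 128:
 *   x =  1.342  ->  round( 1.342 / 0.05 + 128) = round(154.84) = 155
 *   x = -1.342  ->  round(-1.342 / 0.05 + 128) = round(101.16) = 101
 * At this scale, anything outside [-6.4, 6.35] saturates to 0 or 255. */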

static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    struct node* node = exec_node->ir_node;
    struct graph* graph = node->graph;

    struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
    struct tensor* gamma_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
    struct tensor* beta_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]);

    struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);

    struct layernorm_Param* param = (struct layernorm_Param*)node->op.param_mem;
    float eps = param->eps;

    int ret = -1;
    if (input_tensor->data_type == TENGINE_DT_FP32)
        ret = ref_layernorm_fp32(input_tensor, output_tensor, gamma_tensor, beta_tensor, eps);
    else if (input_tensor->data_type == TENGINE_DT_UINT8)
        ret = ref_layernorm_uint8(input_tensor, output_tensor, gamma_tensor, beta_tensor, eps);

    return ret;
}

static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
{
    return OPS_SCORE_BEST;
}

static struct node_ops hcl_node_ops = {.prerun = NULL,
                                       .run = run,
                                       .reshape = NULL,
                                       .postrun = NULL,
                                       .init_node = init_node,
                                       .release_node = release_node,
                                       .score = score};

int register_layernorm_ref_op()
{
    return register_builtin_node_ops(OP_LAYERNORM, &hcl_node_ops);
}

int unregister_layernorm_ref_op()
{
    return unregister_builtin_node_ops(OP_LAYERNORM, &hcl_node_ops);
}
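Both reference kernels above compute the row statistics in one pass, using var = E[x^2] - (E[x])^2, and then fold (x - mean) / sqrt(var + eps) into a single multiply-add per element via a = 1 / sqrt(var + eps) and b = -mean * a. A minimal, self-contained sketch of that arithmetic (illustrative only; layernorm_row is our name, not from the PR):

#include <math.h>
#include <stdio.h>

/* Illustrative only: the row-wise arithmetic shared by both
 * reference kernels in layernorm_ref.c. */
static void layernorm_row(const float* x, float* y, int n,
                          const float* gamma, const float* beta, float eps)
{
    float sum = 0.f, sqsum = 0.f;
    for (int j = 0; j < n; j++)
    {
        sum += x[j];
        sqsum += x[j] * x[j];
    }
    float mean = sum / n;                /* E[x] */
    float var = sqsum / n - mean * mean; /* E[x^2] - (E[x])^2 */
    float a = 1.0f / sqrtf(var + eps);
    float b = -mean * a;                 /* (x - mean) * a == x * a + b */
    for (int j = 0; j < n; j++)
        y[j] = (x[j] * a + b) * gamma[j] + beta[j];
}

int main(void)
{
    const float x[4] = {1.f, 2.f, 3.f, 4.f};
    const float gamma[4] = {1.f, 1.f, 1.f, 1.f};
    const float beta[4] = {0.f, 0.f, 0.f, 0.f};
    float y[4];
    layernorm_row(x, y, 4, gamma, beta, 1e-5f);
    /* mean = 2.5, var = 1.25, so y is roughly {-1.342, -0.447, 0.447, 1.342} */
    for (int j = 0; j < 4; j++)
        printf("%f\n", y[j]);
    return 0;
}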
47 changes: 47 additions & 0 deletions source/device/tim-vx/op/timvx_gelu.cc
@@ -0,0 +1,47 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (c) 2021, Open AI Lab
 * Author: Shijie Chen
 */

#include "timvx_executor.hpp"

extern "C"
{
#include "operator/op.h"
}


bool VXEngine::AddGeluNode(struct node* ir_node)
{
struct graph* ir_graph = ir_node->graph;

struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

auto gelu = graph->CreateOperation<tim::vx::ops::Gelu>();
(*gelu)
.BindInputs({ this->vx_tensor_map[input_tensor->index] })
.BindOutputs({ this->vx_tensor_map[output_tensor->index] });

return true;
}
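For reference, exact GELU is GELU(x) = x * Phi(x) = 0.5 * x * (1 + erf(x / sqrt(2))); whether tim::vx::ops::Gelu evaluates the erf form or the common tanh approximation is a backend detail not verified here. A minimal sketch of both forms (illustrative only, not TIM-VX code):

#include <math.h>

/* Exact GELU: x * Phi(x), with Phi the standard normal CDF. */
static float gelu_exact(float x)
{
    return 0.5f * x * (1.0f + erff(x / sqrtf(2.0f)));
}

/* Common tanh approximation; whether TIM-VX uses it is an
 * assumption, not verified here. */
static float gelu_tanh(float x)
{
    const float c = 0.7978845608f; /* sqrt(2 / pi) */
    return 0.5f * x * (1.0f + tanhf(c * (x + 0.044715f * x * x * x)));
}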

58 changes: 58 additions & 0 deletions source/device/tim-vx/op/timvx_layernorm.cc
@@ -0,0 +1,58 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (c) 2021, Open AI Lab
 * Author: Shijie Chen
 */

#include "timvx_executor.hpp"

extern "C"
{
#include "operator/op.h"
#include "layernorm_param.h"
}


bool VXEngine::AddLayerNormNode(struct node* ir_node)
{
struct graph* ir_graph = ir_node->graph;

std::vector<std::shared_ptr<tim::vx::Tensor> > bn_in_tensor(ir_node->input_num);

int in_set[3] = {0, 2, 1};
for (int i = 0; i < ir_node->input_num; i++)
{
int idx = in_set[i];
struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[idx]);
bn_in_tensor[i] = this->vx_tensor_map[input_tensor->index];
}
struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

struct layernorm_Param* param = (struct layernorm_Param*)ir_node->op.param_mem;

auto layernorm = graph->CreateOperation<tim::vx::ops::LayerNormalization>(0, param->eps);
(*layernorm)
.BindInputs({ bn_in_tensor })
.BindOutputs({ this->vx_tensor_map[output_tensor->index] });

return true;
}
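Two details here are inferred from the code rather than from TIM-VX documentation: the in_set = {0, 2, 1} permutation feeds Tengine's (input, gamma, beta) operands to TIM-VX as (input, beta, gamma), which suggests LayerNormalization binds beta before gamma, and the first argument to CreateOperation (0) is the normalization axis, matching the innermost-dimension branch of the CPU reference. In comment form:

/* Effect of in_set = {0, 2, 1} (inferred, not from TIM-VX docs):
 *   bn_in_tensor[0] <- ir_node->input_tensors[0]   (input)
 *   bn_in_tensor[1] <- ir_node->input_tensors[2]   (beta)
 *   bn_in_tensor[2] <- ir_node->input_tensors[1]   (gamma)
 * CreateOperation<LayerNormalization>(0, eps): axis 0, i.e. the
 * innermost dimension, as in layernorm_ref.c's TIM-VX branch. */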

6 changes: 6 additions & 0 deletions source/device/tim-vx/timvx_executor.cc
@@ -365,6 +365,12 @@ int VXEngine::Build(struct subgraph* subgraph)
        case OP_L2NORMALIZATION:
            this->AddL2normalizationNode(ir_node);
            break;
        case OP_GELU:
            this->AddGeluNode(ir_node);
            break;
        case OP_LAYERNORM:
            this->AddLayerNormNode(ir_node);
            break;
        default:
            fprintf(stderr, "Tengine TIM-VX: Cannot support OP(%d).\n", ir_node->index);
            break;
3 changes: 3 additions & 0 deletions source/device/tim-vx/timvx_executor.hpp
@@ -79,6 +79,7 @@ extern "C" {
#include "tim/vx/ops/transpose.h"
#include "tim/vx/ops/spatial_transformer.h"
#include "tim/vx/ops/l2normalization.h"
#include "tim/vx/ops/layernormalization.h"

#define SPEC_TYPE_CONV 1
#define SPEC_TYPE_CONV_BIAS 2
@@ -145,6 +146,8 @@ class VXEngine
    bool AddUpsampleNode(struct node* ir_node);
    bool AddSpatialtransformerNode(struct node* ir_node);
    bool AddL2normalizationNode(struct node* ir_node);
    bool AddGeluNode(struct node* ir_node);
    bool AddLayerNormNode(struct node* ir_node);

public:
    std::shared_ptr<tim::vx::Context> context;
2 changes: 2 additions & 0 deletions source/device/tim-vx/timvx_limit.hpp
@@ -131,5 +131,7 @@ const int timvx_supported_ops[] = {
    // OP_WHERE,
    // OP_SOFTPLUS,
    // OP_RECIPROCAL,
    OP_GELU,
    OP_LAYERNORM,
    // OP_BUILTIN_LAST
};
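Listing OP_GELU and OP_LAYERNORM here is what allows the partitioner to route those nodes to the TIM-VX device. A hypothetical sketch of the kind of membership test a backend can run against this table (timvx_op_supported is our name, not Tengine's):

#include <stddef.h>

/* Hypothetical helper, not Tengine code: linear scan of the
 * supported-ops table declared in timvx_limit.hpp. */
static int timvx_op_supported(int op_type)
{
    const size_t n = sizeof(timvx_supported_ops) / sizeof(timvx_supported_ops[0]);
    for (size_t i = 0; i < n; i++)
    {
        if (timvx_supported_ops[i] == op_type)
            return 1;
    }
    return 0;
}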
1 change: 1 addition & 0 deletions source/operator/op.h
@@ -140,6 +140,7 @@ enum
    OP_SPATIALTRANSFORMER,
    OP_EXPAND,
    OP_GELU,
    OP_LAYERNORM,
    OP_BUILTIN_LAST
};

1 change: 1 addition & 0 deletions source/operator/op_name.h
@@ -127,3 +127,4 @@
#define OP_SPATIALTRANSFORMER_NAME "SpatialTransformer"
#define OP_EXPAND_NAME "Expand"
#define OP_GELU_NAME "Gelu"
#define OP_LAYERNORM_NAME "LayerNorm"