diff --git a/src/layer/x86/convolution_1x1_int8.h b/src/layer/x86/convolution_1x1_int8.h new file mode 100644 index 00000000000..672ea860689 --- /dev/null +++ b/src/layer/x86/convolution_1x1_int8.h @@ -0,0 +1,198 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + out0.fill(0); + + int q = 0; + + for (; q+7 0; remain--) + { + //ToDo Neon + int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7]; + + *outptr0 += sum0; + + r0++; + r1++; + r2++; + r3++; + r4++; + r5++; + r6++; + r7++; + outptr0++; + } + } + + for (; q 0; remain--) + { + int sum0 = (int)(*r0) * (int)k0; + + *outptr0 += sum0; + + r0++; + outptr0++; + } + } + } +} + +static void conv1x1s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2*outw + w; + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + out0.fill(0); + + int q = 0; + + for (; q+7 0; remain--) + { + //ToDo Neon + int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7]; + + *outptr0 += sum0; + + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + r4 += 2; + r5 += 2; + r6 += 2; + r7 += 2; + outptr0++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + r5 += tailstep; + r6 += tailstep; + r7 += tailstep; + } + } + + for (; q 0; remain--) + { + //ToDo Neon + int sum0 = (int)*r0 * (int)kernel0[0]; + + *outptr0 += sum0; + + r0 += 2; + outptr0++; + } + + r0 += tailstep; + } + } + } +} diff --git a/src/layer/x86/convolution_3x3_int8.h b/src/layer/x86/convolution_3x3_int8.h new file mode 100644 index 00000000000..c3dae1f60ac --- /dev/null +++ b/src/layer/x86/convolution_3x3_int8.h @@ -0,0 +1,149 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + //int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + out0.fill(0); + + const signed char *kernel0 = (const signed char *)kernel + p * inch * 9; + + for (int q = 0; q < inch; q++) + { + int *outptr0 = out0; + + const signed char *img0 = bottom_blob.channel(q); + + const signed char *r0 = img0; + const signed char *r1 = img0 + w; + const signed char *r2 = img0 + w * 2; + + for (int i = 0; i < outh; i++) + { + int remain = outw; + + for (; remain > 0; remain--) + { + int sum0 = 0; + + sum0 += (int)r0[0] * kernel0[0]; + sum0 += (int)r0[1] * kernel0[1]; + sum0 += (int)r0[2] * kernel0[2]; + sum0 += (int)r1[0] * kernel0[3]; + sum0 += (int)r1[1] * kernel0[4]; + sum0 += (int)r1[2] * kernel0[5]; + sum0 += (int)r2[0] * kernel0[6]; + sum0 += (int)r2[1] * kernel0[7]; + sum0 += (int)r2[2] * kernel0[8]; + + *outptr0 += sum0; + + r0++; + r1++; + r2++; + outptr0++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + + kernel0 += 9; + } + } +} + +static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + //int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + out0.fill(0); + + const signed char *kernel0 = (const signed char *)kernel + p * inch * 9; + + for (int q = 0; q < inch; q++) + { + int *outptr0 = out0; + + const signed char *img0 = bottom_blob.channel(q); + + const signed char *r0 = img0; + const signed char *r1 = img0 + w; + const signed char *r2 = img0 + w * 2; + + for (int i = 0; i < outh; i++) + { + int remain = outw; + + for (; remain > 0; remain--) + { + int sum0 = 0; + + sum0 += (int)r0[0] * (int)kernel0[0]; + sum0 += (int)r0[1] * (int)kernel0[1]; + sum0 += (int)r0[2] * (int)kernel0[2]; + sum0 += (int)r1[0] * (int)kernel0[3]; + sum0 += (int)r1[1] * (int)kernel0[4]; + sum0 += (int)r1[2] * (int)kernel0[5]; + sum0 += (int)r2[0] * (int)kernel0[6]; + sum0 += (int)r2[1] * (int)kernel0[7]; + sum0 += (int)r2[2] * (int)kernel0[8]; + + *outptr0 += sum0; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr0++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + + kernel0 += 9; + } + } +} diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index e29face871c..68e8ae1ffce 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -20,6 +20,9 @@ namespace ncnn { #include "convolution_3x3.h" #include "convolution_5x5.h" +#include "convolution_1x1_int8.h" +#include "convolution_3x3_int8.h" + DEFINE_LAYER_CREATOR(Convolution_x86) int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const @@ -142,12 +145,6 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option // convolv with NxN kernel // value = value + bias - if (use_int8_inference) - { - // TODO - return Convolution::forward(bottom_blob, top_blob, opt); - } - if (bottom_blob.dims != 3) { return Convolution::forward(bottom_blob, top_blob, opt); @@ -208,18 +205,75 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } // kernel_size = 5 }; - conv_func conv = conv_func_table[kernel_size-1][stride-1]; - if (!conv) + typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&); + + // kernel_size x stride + conv_int8_func conv_int8_func_table[5][5] = { - return Convolution::forward(bottom_blob, top_blob, opt); + { + conv1x1s1_int8_sse, + conv1x1s2_int8_sse, + 0, + 0, + 0 + }, // kernel_size = 1 + { + 0, + 0, + 0, + 0, + 0 + }, // kernel_size = 2 + { + conv3x3s1_int8_sse, + conv3x3s2_int8_sse, + 0, + 0, + 0 + }, // kernel_size = 3 + { + 0, + 0, + 0, + 0, + 0 + }, // kernel_size = 4 + { + 0, + 0, + 0, + 0, + 0 + } // kernel_size = 5 + }; + + conv_func conv = 0; + conv_int8_func conv_int8 = 0; + + if (use_int8_inference) + { + conv_int8 = conv_int8_func_table[kernel_size-1][stride-1]; + if (!conv_int8) + { + return Convolution::forward(bottom_blob, top_blob, opt); + } } + else + { + conv = conv_func_table[kernel_size-1][stride-1]; + if (!conv) + { + return Convolution::forward(bottom_blob, top_blob, opt); + } - if (dilation_w != 1) { - return forwardDilation(bottom_blob, top_blob, conv, opt); + if (dilation_w != 1) { + return forwardDilation(bottom_blob, top_blob, conv, opt); + } } int w = bottom_blob.w; int h = bottom_blob.h; + int channels = bottom_blob.c; size_t elemsize = bottom_blob.elemsize; Mat bottom_blob_bordered = bottom_blob; @@ -254,6 +308,50 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option if (top_blob.empty()) return -100; + if (use_int8_inference) + { + Mat bottom_blob_bordered_int8; + bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); + if (bottom_blob_bordered_int8.empty()) + return -100; + + float bottom_scale = opt.int8_scales[0]; +// fprintf(stderr, "bottom_scale = %f\n", bottom_scale); + + // quantize, scale and round to nearest + { + ncnn::ParamDict pd; + pd.set(0, bottom_scale);// scale + + quantize->load_param(pd); + + quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt); + } + + conv_int8(bottom_blob_bordered_int8, top_blob, weight_data, opt); + + // dequantize, reverse scale inplace + { + float top_rescale = 1.f / (bottom_scale * weight_data_int8_scale); + + ncnn::ParamDict pd; + pd.set(0, top_rescale);// scale + pd.set(1, bias_term);// bias_term + pd.set(2, num_output);// bias_data_size + + dequantize->load_param(pd); + + ncnn::Mat weights[1]; + weights[0] = bias_data; + + dequantize->load_model(ModelBinFromMatArray(weights)); + + dequantize->forward_inplace(top_blob, opt); + } + + return 0; + } + conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); return 0; diff --git a/src/layer/x86/convolutiondepthwise_3x3_int8.h b/src/layer/x86/convolutiondepthwise_3x3_int8.h new file mode 100644 index 00000000000..61d69846b81 --- /dev/null +++ b/src/layer/x86/convolutiondepthwise_3x3_int8.h @@ -0,0 +1,142 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + //int h = bottom_blob.h; + //int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out = top_blob.channel(p); + + out.fill(0); + + const signed char *kernel0 = (const signed char *)kernel + p * 9; + + int *outptr = out; + + const signed char *img0 = bottom_blob.channel(p); + + const signed char *r0 = img0; + const signed char *r1 = img0 + w; + const signed char *r2 = img0 + w * 2; + + int i = 0; + for (; i < outh; i++) + { + int remain = outw; + + for (; remain > 0; remain--) + { + + int sum = 0; + + sum += (int)r0[0] * (int)kernel0[0]; + sum += (int)r0[1] * (int)kernel0[1]; + sum += (int)r0[2] * (int)kernel0[2]; + sum += (int)r1[0] * (int)kernel0[3]; + sum += (int)r1[1] * (int)kernel0[4]; + sum += (int)r1[2] * (int)kernel0[5]; + sum += (int)r2[0] * (int)kernel0[6]; + sum += (int)r2[1] * (int)kernel0[7]; + sum += (int)r2[2] * (int)kernel0[8]; + + *outptr += sum; + + r0++; + r1++; + r2++; + outptr++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } +} + +static void convdw3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + //int h = bottom_blob.h; + //int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out = top_blob.channel(p); + out.fill(0); + + const signed char *kernel0 = (const signed char *)kernel + p * 9; + + int *outptr = out; + + const signed char *img0 = bottom_blob.channel(p); + + const signed char *r0 = img0; + const signed char *r1 = img0 + w; + const signed char *r2 = img0 + w * 2; + + int i = 0; + + for (; i < outh; i++) + { + int remain = outw; + + for (; remain > 0; remain--) + { + int sum = 0; + + sum += (int)r0[0] * (int)kernel0[0]; + sum += (int)r0[1] * (int)kernel0[1]; + sum += (int)r0[2] * (int)kernel0[2]; + sum += (int)r1[0] * (int)kernel0[3]; + sum += (int)r1[1] * (int)kernel0[4]; + sum += (int)r1[2] * (int)kernel0[5]; + sum += (int)r2[0] * (int)kernel0[6]; + sum += (int)r2[1] * (int)kernel0[7]; + sum += (int)r2[2] * (int)kernel0[8]; + + *outptr += sum; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index e5368f39a3f..aaae9cb59f6 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -24,6 +24,8 @@ namespace ncnn { #include "convolutiondepthwise_3x3.h" +#include "convolutiondepthwise_3x3_int8.h" + DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86) int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const @@ -31,12 +33,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con // convolv with NxN kernel // value = value + bias - if (use_int8_inference) - { - // TODO - return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt); - } - int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; @@ -88,17 +84,76 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con // depth-wise if (channels == group && group == num_output) { - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) + if (use_int8_inference) { - if (stride_w == 1 && stride_h == 1) + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) { - convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); - return 0; + if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) + { + Mat bottom_blob_bordered_int8; + bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); + if (bottom_blob_bordered_int8.empty()) + return -100; + + float bottom_scale = opt.int8_scales[0]; +// fprintf(stderr, "bottom_scale = %f\n", bottom_scale); + + // quantize, scale and round to nearest + { + ncnn::ParamDict pd; + pd.set(0, bottom_scale);// scale + + quantize->load_param(pd); + + quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt); + } + + if (stride_w == 1 && stride_h == 1) + { + convdw3x3s1_int8_sse(bottom_blob_bordered_int8, top_blob, weight_data, opt); + } + else if (stride_w == 2 && stride_h == 2) + { + convdw3x3s2_int8_sse(bottom_blob_bordered_int8, top_blob, weight_data, opt); + } + + // dequantize, reverse scale inplace + { + float top_rescale = 1.f / (bottom_scale * weight_data_int8_scale); + + ncnn::ParamDict pd; + pd.set(0, top_rescale);// scale + pd.set(1, bias_term);// bias_term + pd.set(2, num_output);// bias_data_size + + dequantize->load_param(pd); + + ncnn::Mat weights[1]; + weights[0] = bias_data; + + dequantize->load_model(ModelBinFromMatArray(weights)); + + dequantize->forward_inplace(top_blob, opt); + } + + return 0; + } } - else if (stride_w == 2 && stride_h == 2) + } + else + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) { - convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); - return 0; + if (stride_w == 1 && stride_h == 1) + { + convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); + return 0; + } + else if (stride_w == 2 && stride_h == 2) + { + convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); + return 0; + } } } @@ -112,7 +167,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con { Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g)); Mat top_blob_g(outw, outh, 1, top_blob.channel(g)); - Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g)); + Mat weight_data_g(maxk, (void*)((const unsigned char*)weight_data + maxk * g * weight_data.elemsize), weight_data.elemsize); Mat bias_data_g; if (bias_term) bias_data_g = Mat(1, (void*)((const float*)bias_data + g)); @@ -133,6 +188,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con pd.set(14, 0);// pad_h pd.set(5, bias_term); pd.set(6, maxk);// weight_data_size + pd.set(8, weight_data_int8_scale); op->load_param(pd); @@ -162,7 +218,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con { Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g)); Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g)); - Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g)); + Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize); Mat bias_data_g; if (bias_term) bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g)); @@ -183,6 +239,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con pd.set(14, 0);// pad_h pd.set(5, bias_term); pd.set(6, maxk * channels_g * num_output_g);// weight_data_size + pd.set(8, weight_data_int8_scale); op->load_param(pd);