diff --git a/src/layer/x86/convolution_1x1_int8.h b/src/layer/x86/convolution_1x1_int8.h
new file mode 100644
index 00000000000..672ea860689
--- /dev/null
+++ b/src/layer/x86/convolution_1x1_int8.h
@@ -0,0 +1,198 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv1x1s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        int q = 0;
+
+        for (; q+7<inch; q+=8)
+        {
+            int* outptr0 = out0;
+
+            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+
+            const signed char *r0 = bottom_blob.channel(q);
+            const signed char *r1 = bottom_blob.channel(q + 1);
+            const signed char *r2 = bottom_blob.channel(q + 2);
+            const signed char *r3 = bottom_blob.channel(q + 3);
+            const signed char *r4 = bottom_blob.channel(q + 4);
+            const signed char *r5 = bottom_blob.channel(q + 5);
+            const signed char *r6 = bottom_blob.channel(q + 6);
+            const signed char *r7 = bottom_blob.channel(q + 7);
+
+            int size = outw * outh;
+            int remain = size;
+
+            for (; remain > 0; remain--)
+            {
+                //ToDo Neon
+                int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
+                           (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
+                           (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
+                           (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
+
+                *outptr0 += sum0;
+
+                r0++;
+                r1++;
+                r2++;
+                r3++;
+                r4++;
+                r5++;
+                r6++;
+                r7++;
+                outptr0++;
+            }
+        }
+
+        for (; q<inch; q++)
+        {
+            int* outptr0 = out0;
+
+            const signed char *r0 = bottom_blob.channel(q);
+
+            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+            const signed char k0 = kernel0[0];
+
+            int size = outw * outh;
+            int remain = size;
+
+            for (; remain > 0; remain--)
+            {
+                int sum0 = (int)(*r0) * (int)k0;
+
+                *outptr0 += sum0;
+
+                r0++;
+                outptr0++;
+            }
+        }
+    }
+}
+
+static void conv1x1s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2*outw + w;
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        int q = 0;
+
+        for (; q+7<inch; q+=8)
+        {
+            int* outptr0 = out0;
+
+            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+
+            const signed char *r0 = bottom_blob.channel(q);
+            const signed char *r1 = bottom_blob.channel(q + 1);
+            const signed char *r2 = bottom_blob.channel(q + 2);
+            const signed char *r3 = bottom_blob.channel(q + 3);
+            const signed char *r4 = bottom_blob.channel(q + 4);
+            const signed char *r5 = bottom_blob.channel(q + 5);
+            const signed char *r6 = bottom_blob.channel(q + 6);
+            const signed char *r7 = bottom_blob.channel(q + 7);
+
+            for(int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    //ToDo Neon
+                    int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
+                            (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
+                            (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
+                            (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
+
+                    *outptr0 += sum0;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+                    r4 += 2;
+                    r5 += 2;
+                    r6 += 2;
+                    r7 += 2;
+                    outptr0++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+                r3 += tailstep;
+                r4 += tailstep;
+                r5 += tailstep;
+                r6 += tailstep;
+                r7 += tailstep;
+            }
+        }
+
+        for (; q<inch; q++)
+        {
+            int* outptr0 = out0;
+
+            const signed char *r0 = bottom_blob.channel(q);
+
+            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+
+            for(int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    //ToDo Neon
+                    int sum0 = (int)*r0 * (int)kernel0[0];
+
+                    *outptr0 += sum0;
+
+                    r0 += 2;
+                    outptr0++;
+                }
+
+                r0 += tailstep;
+            }
+        }
+    }
+}
diff --git a/src/layer/x86/convolution_3x3_int8.h b/src/layer/x86/convolution_3x3_int8.h
new file mode 100644
index 00000000000..c3dae1f60ac
--- /dev/null
+++ b/src/layer/x86/convolution_3x3_int8.h
@@ -0,0 +1,149 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    //int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        const signed char *kernel0 = (const signed char *)kernel + p * inch * 9;
+
+        for (int q = 0; q < inch; q++)
+        {
+            int *outptr0 = out0;
+
+            const signed char *img0 = bottom_blob.channel(q);
+
+            const signed char *r0 = img0;
+            const signed char *r1 = img0 + w;
+            const signed char *r2 = img0 + w * 2;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    int sum0 = 0;
+
+                    sum0 += (int)r0[0] * kernel0[0];
+                    sum0 += (int)r0[1] * kernel0[1];
+                    sum0 += (int)r0[2] * kernel0[2];
+                    sum0 += (int)r1[0] * kernel0[3];
+                    sum0 += (int)r1[1] * kernel0[4];
+                    sum0 += (int)r1[2] * kernel0[5];
+                    sum0 += (int)r2[0] * kernel0[6];
+                    sum0 += (int)r2[1] * kernel0[7];
+                    sum0 += (int)r2[2] * kernel0[8];
+
+                    *outptr0 += sum0;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    outptr0++;
+                }
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+            }
+
+            kernel0 += 9;
+        }
+    }
+}
+
+static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    //int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2 * outw + w;
+
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        const signed char *kernel0 = (const signed char *)kernel + p * inch * 9;
+
+        for (int q = 0; q < inch; q++)
+        {
+            int *outptr0 = out0;
+
+            const signed char *img0 = bottom_blob.channel(q);
+
+            const signed char *r0 = img0;
+            const signed char *r1 = img0 + w;
+            const signed char *r2 = img0 + w * 2;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    int sum0 = 0;
+
+                    sum0 += (int)r0[0] * (int)kernel0[0];
+                    sum0 += (int)r0[1] * (int)kernel0[1];
+                    sum0 += (int)r0[2] * (int)kernel0[2];
+                    sum0 += (int)r1[0] * (int)kernel0[3];
+                    sum0 += (int)r1[1] * (int)kernel0[4];
+                    sum0 += (int)r1[2] * (int)kernel0[5];
+                    sum0 += (int)r2[0] * (int)kernel0[6];
+                    sum0 += (int)r2[1] * (int)kernel0[7];
+                    sum0 += (int)r2[2] * (int)kernel0[8];
+
+                    *outptr0 += sum0;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    outptr0++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+            }
+
+            kernel0 += 9;
+        }
+    }
+}
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index e29face871c..68e8ae1ffce 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -20,6 +20,9 @@ namespace ncnn {
 #include "convolution_3x3.h"
 #include "convolution_5x5.h"
 
+#include "convolution_1x1_int8.h"
+#include "convolution_3x3_int8.h"
+
 DEFINE_LAYER_CREATOR(Convolution_x86)
 
 int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
@@ -142,12 +145,6 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     // convolv with NxN kernel
     // value = value + bias
 
-    if (use_int8_inference)
-    {
-        // TODO
-        return Convolution::forward(bottom_blob, top_blob, opt);
-    }
-
     if (bottom_blob.dims != 3)
     {
         return Convolution::forward(bottom_blob, top_blob, opt);
@@ -208,18 +205,75 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }  // kernel_size = 5
     };
 
-    conv_func conv = conv_func_table[kernel_size-1][stride-1];
-    if (!conv)
+    typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&);
+
+    // kernel_size x stride
+    conv_int8_func conv_int8_func_table[5][5] =
     {
-        return Convolution::forward(bottom_blob, top_blob, opt);
+        {
+            conv1x1s1_int8_sse,
+            conv1x1s2_int8_sse,
+            0,
+            0,
+            0
+        }, // kernel_size = 1
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 2
+        {
+            conv3x3s1_int8_sse,
+            conv3x3s2_int8_sse,
+            0,
+            0,
+            0
+        }, // kernel_size = 3
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 4
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }  // kernel_size = 5
+    };
+
+    conv_func conv = 0;
+    conv_int8_func conv_int8 = 0;
+
+    if (use_int8_inference)
+    {
+        conv_int8 = conv_int8_func_table[kernel_size-1][stride-1];
+        if (!conv_int8)
+        {
+            return Convolution::forward(bottom_blob, top_blob, opt);
+        }
     }
+    else
+    {
+        conv = conv_func_table[kernel_size-1][stride-1];
+        if (!conv)
+        {
+            return Convolution::forward(bottom_blob, top_blob, opt);
+        }
 
-    if (dilation_w != 1) {
-        return forwardDilation(bottom_blob, top_blob, conv, opt);
+        if (dilation_w != 1) {
+            return forwardDilation(bottom_blob, top_blob, conv, opt);
+        }
     }
 
     int w = bottom_blob.w;
     int h = bottom_blob.h;
+    int channels = bottom_blob.c;
     size_t elemsize = bottom_blob.elemsize;
 
     Mat bottom_blob_bordered = bottom_blob;
@@ -254,6 +308,50 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     if (top_blob.empty())
         return -100;
 
+    if (use_int8_inference)
+    {
+        Mat bottom_blob_bordered_int8;
+        bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
+        if (bottom_blob_bordered_int8.empty())
+            return -100;
+
+        float bottom_scale = opt.int8_scales[0];
+//         fprintf(stderr, "bottom_scale = %f\n", bottom_scale);
+
+        // quantize, scale and round to nearest
+        {
+            ncnn::ParamDict pd;
+            pd.set(0, bottom_scale);// scale
+
+            quantize->load_param(pd);
+
+            quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt);
+        }
+
+        conv_int8(bottom_blob_bordered_int8, top_blob, weight_data, opt);
+
+        // dequantize, reverse scale inplace
+        {
+            float top_rescale = 1.f / (bottom_scale * weight_data_int8_scale);
+
+            ncnn::ParamDict pd;
+            pd.set(0, top_rescale);// scale
+            pd.set(1, bias_term);// bias_term
+            pd.set(2, num_output);// bias_data_size
+
+            dequantize->load_param(pd);
+
+            ncnn::Mat weights[1];
+            weights[0] = bias_data;
+
+            dequantize->load_model(ModelBinFromMatArray(weights));
+
+            dequantize->forward_inplace(top_blob, opt);
+        }
+
+        return 0;
+    }
+
     conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
 
     return 0;
diff --git a/src/layer/x86/convolutiondepthwise_3x3_int8.h b/src/layer/x86/convolutiondepthwise_3x3_int8.h
new file mode 100644
index 00000000000..61d69846b81
--- /dev/null
+++ b/src/layer/x86/convolutiondepthwise_3x3_int8.h
@@ -0,0 +1,142 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void convdw3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    //int h = bottom_blob.h;
+    //int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        out.fill(0);
+
+        const signed char *kernel0 = (const signed char *)kernel + p * 9;
+
+        int *outptr = out;
+
+        const signed char *img0 = bottom_blob.channel(p);
+
+        const signed char *r0 = img0;
+        const signed char *r1 = img0 + w;
+        const signed char *r2 = img0 + w * 2;
+
+        int i = 0;
+        for (; i < outh; i++)
+        {
+            int remain = outw;
+
+            for (; remain > 0; remain--)
+            {
+
+                int sum = 0;
+
+                sum += (int)r0[0] * (int)kernel0[0];
+                sum += (int)r0[1] * (int)kernel0[1];
+                sum += (int)r0[2] * (int)kernel0[2];
+                sum += (int)r1[0] * (int)kernel0[3];
+                sum += (int)r1[1] * (int)kernel0[4];
+                sum += (int)r1[2] * (int)kernel0[5];
+                sum += (int)r2[0] * (int)kernel0[6];
+                sum += (int)r2[1] * (int)kernel0[7];
+                sum += (int)r2[2] * (int)kernel0[8];
+
+                *outptr += sum;
+
+                r0++;
+                r1++;
+                r2++;
+                outptr++;
+            }
+
+            r0 += 2;
+            r1 += 2;
+            r2 += 2;
+        }
+    }
+}
+
+static void convdw3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    //int h = bottom_blob.h;
+    //int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2 * outw + w;
+
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+        out.fill(0);
+
+        const signed char *kernel0 = (const signed char *)kernel + p * 9;
+
+        int *outptr = out;
+
+        const signed char *img0 = bottom_blob.channel(p);
+
+        const signed char *r0 = img0;
+        const signed char *r1 = img0 + w;
+        const signed char *r2 = img0 + w * 2;
+
+        int i = 0;
+
+        for (; i < outh; i++)
+        {
+            int remain = outw;
+
+            for (; remain > 0; remain--)
+            {
+                int sum = 0;
+
+                sum += (int)r0[0] * (int)kernel0[0];
+                sum += (int)r0[1] * (int)kernel0[1];
+                sum += (int)r0[2] * (int)kernel0[2];
+                sum += (int)r1[0] * (int)kernel0[3];
+                sum += (int)r1[1] * (int)kernel0[4];
+                sum += (int)r1[2] * (int)kernel0[5];
+                sum += (int)r2[0] * (int)kernel0[6];
+                sum += (int)r2[1] * (int)kernel0[7];
+                sum += (int)r2[2] * (int)kernel0[8];
+
+                *outptr += sum;
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+                outptr++;
+            }
+
+            r0 += tailstep;
+            r1 += tailstep;
+            r2 += tailstep;
+        }
+    }
+}
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
index e5368f39a3f..aaae9cb59f6 100644
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -24,6 +24,8 @@ namespace ncnn {
 
 #include "convolutiondepthwise_3x3.h"
 
+#include "convolutiondepthwise_3x3_int8.h"
+
 DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86)
 
 int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
@@ -31,12 +33,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
     // convolv with NxN kernel
     // value = value + bias
 
-    if (use_int8_inference)
-    {
-        // TODO
-        return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt);
-    }
-
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
@@ -88,17 +84,76 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
     // depth-wise
     if (channels == group && group == num_output)
     {
-        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
+        if (use_int8_inference)
         {
-            if (stride_w == 1 && stride_h == 1)
+            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
             {
-                convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
-                return 0;
+                if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
+                {
+                    Mat bottom_blob_bordered_int8;
+                    bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
+                    if (bottom_blob_bordered_int8.empty())
+                        return -100;
+
+                    float bottom_scale = opt.int8_scales[0];
+//                     fprintf(stderr, "bottom_scale = %f\n", bottom_scale);
+
+                    // quantize, scale and round to nearest
+                    {
+                        ncnn::ParamDict pd;
+                        pd.set(0, bottom_scale);// scale
+
+                        quantize->load_param(pd);
+
+                        quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt);
+                    }
+
+                    if (stride_w == 1 && stride_h == 1)
+                    {
+                        convdw3x3s1_int8_sse(bottom_blob_bordered_int8, top_blob, weight_data, opt);
+                    }
+                    else if (stride_w == 2 && stride_h == 2)
+                    {
+                        convdw3x3s2_int8_sse(bottom_blob_bordered_int8, top_blob, weight_data, opt);
+                    }
+
+                    // dequantize, reverse scale inplace
+                    {
+                        float top_rescale = 1.f / (bottom_scale * weight_data_int8_scale);
+
+                        ncnn::ParamDict pd;
+                        pd.set(0, top_rescale);// scale
+                        pd.set(1, bias_term);// bias_term
+                        pd.set(2, num_output);// bias_data_size
+
+                        dequantize->load_param(pd);
+
+                        ncnn::Mat weights[1];
+                        weights[0] = bias_data;
+
+                        dequantize->load_model(ModelBinFromMatArray(weights));
+
+                        dequantize->forward_inplace(top_blob, opt);
+                    }
+
+                    return 0;
+                }
             }
-            else if (stride_w == 2 && stride_h == 2)
+        }
+        else
+        {
+            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
             {
-                convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
-                return 0;
+                if (stride_w == 1 && stride_h == 1)
+                {
+                    convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
+                    return 0;
+                }
+                else if (stride_w == 2 && stride_h == 2)
+                {
+                    convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
+                    return 0;
+                }
             }
         }
 
@@ -112,7 +167,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
         {
             Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
             Mat top_blob_g(outw, outh, 1, top_blob.channel(g));
-            Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));
+            Mat weight_data_g(maxk, (void*)((const unsigned char*)weight_data + maxk * g * weight_data.elemsize), weight_data.elemsize);
             Mat bias_data_g;
             if (bias_term)
                 bias_data_g = Mat(1, (void*)((const float*)bias_data + g));
@@ -133,6 +188,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
             pd.set(14, 0);// pad_h
             pd.set(5, bias_term);
             pd.set(6, maxk);// weight_data_size
+            pd.set(8, weight_data_int8_scale);
 
             op->load_param(pd);
 
@@ -162,7 +218,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
     {
         Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g));
         Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g));
-        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g));
+        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
         Mat bias_data_g;
         if (bias_term)
             bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));
@@ -183,6 +239,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
         pd.set(14, 0);// pad_h
         pd.set(5, bias_term);
         pd.set(6, maxk * channels_g * num_output_g);// weight_data_size
+        pd.set(8, weight_data_int8_scale);
 
         op->load_param(pd);