add scale_channels, swish, logistic(sigmoid), avgpool ; add enet_coco(EfficientNetB0-Yolov3).cfg #172

Open: wants to merge 6 commits into master
1 change: 1 addition & 0 deletions include/tkDNN/DarknetParser.h
@@ -28,6 +28,7 @@ namespace tk { namespace dnn {
int new_coords= 0;
float scale_xy = 1;
float nms_thresh = 0.45;
int scale_wh_in_scale_channels = 0;
std::vector<int> layers;
std::string activation = "linear";

28 changes: 27 additions & 1 deletion include/tkDNN/Layer.h
@@ -19,6 +19,7 @@ enum layerType_t {
LAYER_ACTIVATION_CRELU,
LAYER_ACTIVATION_LEAKY,
LAYER_ACTIVATION_MISH,
LAYER_ACTIVATION_SWISH,
LAYER_FLATTEN,
LAYER_RESHAPE,
LAYER_MULADD,
@@ -27,6 +28,7 @@ enum layerType_t {
LAYER_ROUTE,
LAYER_REORG,
LAYER_SHORTCUT,
LAYER_SCALECHANNELS,
LAYER_UPSAMPLE,
LAYER_REGION,
LAYER_YOLO
@@ -68,6 +70,7 @@ class Layer {
case LAYER_ACTIVATION_CRELU: return "ActivationCReLU";
case LAYER_ACTIVATION_LEAKY: return "ActivationLeaky";
case LAYER_ACTIVATION_MISH: return "ActivationMish";
case LAYER_ACTIVATION_SWISH: return "ActivationSwish";
case LAYER_FLATTEN: return "Flatten";
case LAYER_RESHAPE: return "Reshape";
case LAYER_MULADD: return "MulAdd";
@@ -76,6 +79,7 @@
case LAYER_ROUTE: return "Route";
case LAYER_REORG: return "Reorg";
case LAYER_SHORTCUT: return "Shortcut";
case LAYER_SCALECHANNELS: return "ScaleChannels";
case LAYER_UPSAMPLE: return "Upsample";
case LAYER_REGION: return "Region";
case LAYER_YOLO: return "Yolo";
@@ -212,7 +216,8 @@ class Dense : public LayerWgs {
typedef enum {
ACTIVATION_ELU = 100,
ACTIVATION_LEAKY = 101,
ACTIVATION_MISH = 102
ACTIVATION_MISH = 102,
ACTIVATION_SWISH = 103
} tkdnnActivationMode_t;

/**
@@ -233,6 +238,8 @@ class Activation : public Layer {
return LAYER_ACTIVATION_LEAKY;
else if (act_mode == ACTIVATION_MISH)
return LAYER_ACTIVATION_MISH;
else if (act_mode == ACTIVATION_SWISH)
return LAYER_ACTIVATION_SWISH;
else
return LAYER_ACTIVATION;
};
@@ -557,6 +564,25 @@ class Shortcut : public Layer {
Layer *backLayer;
};

/**
ScaleChannels layer
channel-wise multiplication with the output of another layer
*/
class ScaleChannels : public Layer {

public:
ScaleChannels(Network *net, Layer *backLayer, int scale_wh);
virtual ~ScaleChannels();
virtual layerType_t getLayerType() { return LAYER_SCALECHANNELS; };

virtual dnnType* infer(dataDim_t &dim, dnnType* srcData);

public:
Layer *backLayer;
int scale_wh;
};


/**
Upsample layer
Maintains same dimension but change C*H*W distribution
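
Note on the new ScaleChannels layer: as wired up in src/ScaleChannels.cpp below, the layer's direct input carries the per-channel scales (the C x 1 x 1 output of the SE branch), while backLayer supplies the feature map being rescaled; the output keeps backLayer's dimensions. A minimal CPU sketch of the scale_wh = 0 behaviour (illustrative only, the names below are not part of the PR):

#include <cstddef>
#include <vector>

// out[n][c][h][w] = trunk[n][c][h][w] * scales[n][c]
// trunk  : N*C*H*W values (backLayer output)
// scales : N*C values     (this layer's input, i.e. C x 1 x 1 per image)
std::vector<float> scale_channels_ref(const std::vector<float>& trunk,
                                      const std::vector<float>& scales,
                                      int N, int C, int H, int W) {
    std::vector<float> out(trunk.size());
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c) {
            const float s = scales[n * C + c];
            for (int i = 0; i < H * W; ++i) {
                const std::size_t idx = ((std::size_t)n * C + c) * H * W + i;
                out[idx] = trunk[idx] * s;
            }
        }
    return out;
}
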
2 changes: 2 additions & 0 deletions include/tkDNN/NetworkRT.h
@@ -26,6 +26,7 @@ using namespace nvinfer1;
#include "pluginsRT/ActivationLeakyRT.h"
#include "pluginsRT/ActivationReLUCeilingRT.h"
#include "pluginsRT/ActivationMishRT.h"
#include "pluginsRT/ActivationSwishRT.h"
#include "pluginsRT/ReorgRT.h"
#include "pluginsRT/RegionRT.h"
#include "pluginsRT/RouteRT.h"
@@ -108,6 +109,7 @@ class NetworkRT {
nvinfer1::ILayer* convert_layer(nvinfer1::ITensor *input, Reorg *l);
nvinfer1::ILayer* convert_layer(nvinfer1::ITensor *input, Region *l);
nvinfer1::ILayer* convert_layer(nvinfer1::ITensor *input, Shortcut *l);
nvinfer1::ILayer* convert_layer(nvinfer1::ITensor *input, ScaleChannels *l);
nvinfer1::ILayer* convert_layer(nvinfer1::ITensor *input, Yolo *l);
nvinfer1::ILayer* convert_layer(nvinfer1::ITensor *input, Upsample *l);
nvinfer1::ILayer* convert_layer(nvinfer1::ITensor *input, DeformConv2d *l);
5 changes: 5 additions & 0 deletions include/tkDNN/kernels.h
@@ -9,6 +9,7 @@ void activationReLUCeilingForward(dnnType *srcData, dnnType *dstData, int size,
void activationLOGISTICForward(dnnType *srcData, dnnType *dstData, int size, cudaStream_t stream = cudaStream_t(0));
void activationSIGMOIDForward(dnnType *srcData, dnnType *dstData, int size, cudaStream_t stream = cudaStream_t(0));
void activationMishForward(dnnType* srcData, dnnType* dstData, int size, cudaStream_t stream= cudaStream_t(0));
void activationSwishForward(dnnType* srcData, dnnType* dstData, int size, cudaStream_t stream= cudaStream_t(0));

void fill(dnnType *data, int size, dnnType val, cudaStream_t stream = cudaStream_t(0));

@@ -27,6 +28,10 @@ void shortcutForward(dnnType *srcData, dnnType *dstData, int n1, int c1, int h1,
int n2, int c2, int h2, int w2, int s2,
cudaStream_t stream = cudaStream_t(0));

void scaleChannelsForward(dnnType *in_w_h_c, int size, int channel_size, int batch_size, int scale_wh,
dnnType *scales_c, dnnType *out,
cudaStream_t stream = cudaStream_t(0));

void upsampleForward(dnnType *srcData, dnnType *dstData,
int n, int c, int h, int w, int s, int forward, float scale,
cudaStream_t stream = cudaStream_t(0));
60 changes: 60 additions & 0 deletions include/tkDNN/pluginsRT/ActivationSwishRT.h
@@ -0,0 +1,60 @@
#include <cassert>
#include "../kernels.h"

class ActivationSwishRT : public IPlugin {

public:
ActivationSwishRT() {


}

~ActivationSwishRT(){

}

int getNbOutputs() const override {
return 1;
}

Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override {
return inputs[0];
}

void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override {
size = 1;
for(int i=0; i<outputDims[0].nbDims; i++)
size *= outputDims[0].d[i];
}

int initialize() override {

return 0;
}

virtual void terminate() override {
}

virtual size_t getWorkspaceSize(int maxBatchSize) const override {
return 0;
}

virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override {

activationSwishForward((dnnType*)reinterpret_cast<const dnnType*>(inputs[0]),
reinterpret_cast<dnnType*>(outputs[0]), batchSize*size, stream);
return 0;
}


virtual size_t getSerializationSize() override {
return 1*sizeof(int);
}

virtual void serialize(void* buffer) override {
char *buf = reinterpret_cast<char*>(buffer);
tk::dnn::writeBUF(buf, size);
}

int size;
};
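
A note on (de)serialization with the old IPlugin API: getSerializationSize() reports a single int and serialize() stores this->size via tk::dnn::writeBUF; the matching readBUF<int> call is added to PluginFactory::createPlugin further down in src/NetworkRT.cpp. A minimal round-trip sketch (the writeBUF/readBUF stand-ins below are assumptions that mirror how the other tkDNN plugins use them, i.e. copy sizeof(T) bytes and advance the pointer):

#include <cassert>
#include <cstring>

// Hypothetical stand-ins for tk::dnn::writeBUF / readBUF.
template <typename T> void writeBUF(char*& buf, const T& v) { std::memcpy(buf, &v, sizeof(T)); buf += sizeof(T); }
template <typename T> T readBUF(const char*& buf) { T v; std::memcpy(&v, buf, sizeof(T)); buf += sizeof(T); return v; }

int main() {
    int size = 1280 * 7 * 7;            // e.g. the tensor size cached by configure()
    char storage[sizeof(int)];

    char* w = storage;
    writeBUF(w, size);                  // what ActivationSwishRT::serialize() does

    const char* r = storage;
    assert(readBUF<int>(r) == size);    // what PluginFactory::createPlugin() recovers
    return 0;
}
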
3 changes: 3 additions & 0 deletions src/Activation.cpp
@@ -52,6 +52,9 @@ dnnType* Activation::infer(dataDim_t &dim, dnnType* srcData) {
else if(act_mode == ACTIVATION_MISH) {
activationMishForward(srcData, dstData, dim.tot());

}
else if(act_mode == ACTIVATION_SWISH) {
activationSwishForward(srcData, dstData, dim.tot());
} else {
dnnType alpha = dnnType(1);
dnnType beta = dnnType(0);
22 changes: 21 additions & 1 deletion src/DarknetParser.cpp
@@ -80,6 +80,8 @@ namespace tk { namespace dnn {
fields.groups = std::stoi(value);
else if(name.find("group_id") != std::string::npos)
fields.group_id = std::stoi(value);
else if(name.find("scale_wh") != std::string::npos)
fields.scale_wh_in_scale_channels = std::stoi(value);
else if(name.find("scale_x_y") != std::string::npos)
fields.scale_xy = std::stof(value);
else if(name.find("beta_nms") != std::string::npos)
@@ -134,7 +136,11 @@ namespace tk { namespace dnn {
f.padding_x, f.padding_y, tk::dnn::POOLING_MAX));

} else if(f.type == "avgpool") {
netLayers.push_back(new tk::dnn::Pooling(net, f.size_x, f.size_y, f.stride_x, f.stride_y,
auto output_dim = net->getOutputDim();
int stride = 1;
assert(f.padding_x == 0 && f.padding_y == 0);

netLayers.push_back(new tk::dnn::Pooling(net, output_dim.h, output_dim.w, stride, stride,
f.padding_x, f.padding_y, tk::dnn::POOLING_AVERAGE));

} else if(f.type == "shortcut") {
@@ -146,6 +152,18 @@
//std::cout<<"shortcut to "<<layerIdx<<" "<<netLayers[layerIdx]->getLayerName()<<"\n";
netLayers.push_back(new tk::dnn::Shortcut(net, netLayers[layerIdx]));

} else if(f.type == "scale_channels") {
if(f.layers.size() != 1) FatalError("no layers to scale_channels\n");
int layerIdx = f.layers[0];
if(layerIdx < 0)
layerIdx = netLayers.size() + layerIdx;
if(layerIdx < 0 || layerIdx >= netLayers.size()) FatalError("impossible to scale_channels\n");

int scale_wh = f.scale_wh_in_scale_channels;
if(scale_wh != 0) FatalError("Currently only scale_wh=0 is supported in scale_channels\n");

netLayers.push_back(new tk::dnn::ScaleChannels(net, netLayers[layerIdx], scale_wh));

} else if(f.type == "upsample") {
netLayers.push_back(new tk::dnn::Upsample(net, f.stride_x));

@@ -185,8 +203,10 @@
if(netLayers.size() > 0 && f.activation != "linear") {
tkdnnActivationMode_t act;
if(f.activation == "relu") act = tkdnnActivationMode_t(CUDNN_ACTIVATION_RELU);
else if(f.activation == "logistic") act = tkdnnActivationMode_t(CUDNN_ACTIVATION_SIGMOID);
else if(f.activation == "leaky") act = tk::dnn::ACTIVATION_LEAKY;
else if(f.activation == "mish") act = tk::dnn::ACTIVATION_MISH;
else if(f.activation == "swish") act = tk::dnn::ACTIVATION_SWISH;
else { FatalError("activation not supported: " + f.activation); }
netLayers[netLayers.size()-1] = new tk::dnn::Activation(net, act);
};
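
Taken together, the avgpool, logistic and scale_channels additions are what the parser needs for the squeeze-and-excitation blocks in the EfficientNet-B0 cfg (the new "logistic" case maps onto cuDNN's sigmoid activation): a global average pool squeezes each channel to 1 x 1, two 1x1 convolutions (swish, then logistic) turn that into per-channel gates, and scale_channels multiplies the gates back onto the trunk. A compact CPU sketch of that dataflow (purely illustrative; the single per-channel affine below stands in for the two 1x1 convolutions and is not tkDNN code):

#include <cmath>
#include <vector>

// Squeeze-and-excitation on one C x H x W tensor.
std::vector<float> se_block(const std::vector<float>& x, int C, int H, int W,
                            const std::vector<float>& gate_w, const std::vector<float>& gate_b) {
    std::vector<float> out(x.size());
    for (int c = 0; c < C; ++c) {
        // [avgpool]: global average over H x W -> one value per channel
        float mean = 0.f;
        for (int i = 0; i < H * W; ++i) mean += x[c * H * W + i];
        mean /= float(H * W);

        // stand-in for conv(swish) + conv(logistic): per-channel affine followed by a sigmoid gate
        const float gate = 1.f / (1.f + std::exp(-(gate_w[c] * mean + gate_b[c])));

        // [scale_channels] with scale_wh=0: rescale the trunk channel by its gate
        for (int i = 0; i < H * W; ++i) out[c * H * W + i] = x[c * H * W + i] * gate;
    }
    return out;
}
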
23 changes: 22 additions & 1 deletion src/NetworkRT.cpp
@@ -226,7 +226,7 @@ ILayer* NetworkRT::convert_layer(ITensor *input, Layer *l) {
return convert_layer(input, (Conv2d*) l);
if(type == LAYER_POOLING)
return convert_layer(input, (Pooling*) l);
if(type == LAYER_ACTIVATION || type == LAYER_ACTIVATION_CRELU || type == LAYER_ACTIVATION_LEAKY || type == LAYER_ACTIVATION_MISH)
if(type == LAYER_ACTIVATION || type == LAYER_ACTIVATION_CRELU || type == LAYER_ACTIVATION_LEAKY || type == LAYER_ACTIVATION_MISH || type == LAYER_ACTIVATION_SWISH)
return convert_layer(input, (Activation*) l);
if(type == LAYER_SOFTMAX)
return convert_layer(input, (Softmax*) l);
@@ -242,6 +242,8 @@
return convert_layer(input, (Region*) l);
if(type == LAYER_SHORTCUT)
return convert_layer(input, (Shortcut*) l);
if(type == LAYER_SCALECHANNELS)
return convert_layer(input, (ScaleChannels*) l);
if(type == LAYER_YOLO)
return convert_layer(input, (Yolo*) l);
if(type == LAYER_UPSAMPLE)
@@ -421,6 +423,12 @@ ILayer* NetworkRT::convert_layer(ITensor *input, Activation *l) {
checkNULL(lRT);
return lRT;
}
else if(l->act_mode == ACTIVATION_SWISH) {
IPlugin *plugin = new ActivationSwishRT();
IPluginLayer *lRT = networkRT->addPlugin(&input, 1, *plugin);
checkNULL(lRT);
return lRT;
}
else {
FatalError("this Activation mode is not yet implemented");
return NULL;
@@ -525,6 +533,14 @@ ILayer* NetworkRT::convert_layer(ITensor *input, Shortcut *l) {
}
}

ILayer* NetworkRT::convert_layer(ITensor *input, ScaleChannels *l) {
ITensor *back_tens = tensors[l->backLayer];

IElementWiseLayer *lRT = networkRT->addElementWise(*input, *back_tens, ElementWiseOperation::kPROD);
checkNULL(lRT);
return lRT;
}

ILayer* NetworkRT::convert_layer(ITensor *input, Yolo *l) {
//std::cout<<"convert Yolo\n";

@@ -653,6 +669,11 @@ IPlugin* PluginFactory::createPlugin(const char* layerName, const void* serialData
a->size = readBUF<int>(buf);
return a;
}
if(name.find("ActivationSwish") == 0) {
ActivationSwishRT *a = new ActivationSwishRT();
a->size = readBUF<int>(buf);
return a;
}
if(name.find("ActivationCReLU") == 0) {
ActivationReLUCeiling *a = new ActivationReLUCeiling(readBUF<float>(buf));
a->size = readBUF<int>(buf);
37 changes: 37 additions & 0 deletions src/ScaleChannels.cpp
@@ -0,0 +1,37 @@
#include <iostream>

#include "Layer.h"
#include "kernels.h"

namespace tk { namespace dnn {
ScaleChannels::ScaleChannels(Network *net, Layer *backLayer, int scale_wh) : Layer(net) {

this->backLayer = backLayer;
this->scale_wh = scale_wh;
output_dim = backLayer->output_dim;
checkCuda( cudaMalloc(&dstData, output_dim.tot()*sizeof(dnnType)) );

if( backLayer->output_dim.c != input_dim.c )
FatalError("ScaleChannels dim missmatch");

}

ScaleChannels::~ScaleChannels() {

checkCuda( cudaFree(dstData) );
}

dnnType* ScaleChannels::infer(dataDim_t &dim, dnnType* srcData) {

int size = output_dim.n * output_dim.c * output_dim.h * output_dim.w;
int channel_size = output_dim.h * output_dim.w;
int batch_size = output_dim.c * output_dim.h * output_dim.w;
scaleChannelsForward(this->backLayer->dstData, size, channel_size, batch_size, scale_wh, srcData, dstData);

//update data dimensions
dim = output_dim;

return dstData;
}

}}
27 changes: 27 additions & 0 deletions src/kernels/activation_swish.cu
@@ -0,0 +1,27 @@
#include "kernels.h"
#include <math.h>

// https://github.com/AlexeyAB/darknet/blob/master/src/activation_kernels.cu
__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));}

__global__
void activation_swish(dnnType *input, dnnType *output, int size) {
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (i < size)
{
float x_val = input[i];
float sigmoid = logistic_activate_kernel(x_val);
output[i] = x_val * sigmoid;
}
}

/**
swish activation function
*/
void activationSwishForward(dnnType* srcData, dnnType* dstData, int size, cudaStream_t stream)
{
int blocks = (size+255)/256;
int threads = 256;

activation_swish<<<blocks, threads, 0, stream>>>(srcData, dstData, size);
}
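
Swish is x * sigmoid(x) (also known as SiLU), so the kernel is easy to spot-check against a CPU loop. A host-side sketch of such a check (illustrative only; it assumes dnnType is float, as elsewhere in tkDNN, and that kernels.h is reachable under the include path shown):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include "tkDNN/kernels.h"   // assumption: declares activationSwishForward as in this PR

int main() {
    const int n = 1 << 16;
    std::vector<float> h_in(n), h_out(n);
    for (int i = 0; i < n; ++i) h_in[i] = 0.001f * (i - n / 2);   // inputs roughly in [-32, 32]

    float *d_in = nullptr, *d_out = nullptr;
    cudaMalloc(&d_in,  n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    activationSwishForward(d_in, d_out, n);     // default stream
    cudaMemcpy(h_out.data(), d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

    float max_err = 0.f;
    for (int i = 0; i < n; ++i) {
        const float ref = h_in[i] / (1.f + std::exp(-h_in[i]));   // x * sigmoid(x)
        max_err = std::max(max_err, std::fabs(ref - h_out[i]));
    }
    std::printf("max |gpu - cpu| = %g\n", max_err);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
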
27 changes: 27 additions & 0 deletions src/kernels/scale_channels.cu
@@ -0,0 +1,27 @@
#include "kernels.h"
#include "assert.h"

// https://github.com/AlexeyAB/darknet/blob/master/src/blas_kernels.cu
__global__ void scale_channels_kernel(float *in_w_h_c, int size, int channel_size, int batch_size, int scale_wh, float *scales_c, float *out)
{
const int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index < size) {
if (scale_wh) {
int osd_index = index % channel_size + (index / batch_size)*channel_size;

out[index] = in_w_h_c[index] * scales_c[osd_index];
}
else {
out[index] = in_w_h_c[index] * scales_c[index / channel_size];
}
}
}

void scaleChannelsForward(dnnType *in_w_h_c, int size, int channel_size, int batch_size, int scale_wh,
dnnType *scales_c, dnnType *out, cudaStream_t stream)
{
int blocks = (size+255)/256;
int threads = 256;

scale_channels_kernel <<<blocks, threads, 0, stream>>>(in_w_h_c, size, channel_size, batch_size, scale_wh, scales_c, out);
}
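
The scale_wh = 1 branch is the non-obvious one: osd_index keeps the spatial position and drops the channel, i.e. the scale tensor holds one value per (image, pixel) and is broadcast across channels, the mirror image of the scale_wh = 0 case used by the SE blocks (with channel_size = H*W and batch_size = C*H*W, as passed from ScaleChannels::infer). A CPU mirror of the kernel's flat-index arithmetic (illustrative only):

#include <vector>

// Same index arithmetic as scale_channels_kernel.
// scale_wh = 0 : scales holds one value per (image, channel)  -> N*C entries
// scale_wh = 1 : scales holds one value per (image, pixel)    -> N*H*W entries
std::vector<float> scale_channels_cpu(const std::vector<float>& in, const std::vector<float>& scales,
                                      int size, int channel_size, int batch_size, int scale_wh) {
    std::vector<float> out(size);
    for (int index = 0; index < size; ++index) {
        if (scale_wh) {
            const int osd_index = index % channel_size + (index / batch_size) * channel_size;
            out[index] = in[index] * scales[osd_index];
        } else {
            out[index] = in[index] * scales[index / channel_size];
        }
    }
    return out;
}
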