CUDNN dropout (apache#13896)
* cudnn dropout

* test dropout as stateful op

* add cudnn_off

* refactor

* fix bug when using inf forward

* turn on cudnn in gluon

* reuse dropout state space

* dropout passthrough

* address comments
szha authored and haohuw committed Jun 23, 2019
1 parent 5bbbed6 commit 48aa673
Showing 13 changed files with 604 additions and 195 deletions.
20 changes: 20 additions & 0 deletions docs/faq/env_var.md
@@ -227,6 +227,26 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
- Maximum value is 60.
- This variable controls how many weights will be updated in a single call to the optimizer (for optimizers that support aggregation, currently limited to SGD).

* MXNET_CPU_TEMP_COPY
  - Values: Int ```(default=4)```
  - This variable controls how many temporary memory resources to create for all CPU contexts, for use in operators.

* MXNET_GPU_TEMP_COPY
  - Values: Int ```(default=1)```
  - This variable controls how many temporary memory resources to create for each GPU context, for use in operators.

* MXNET_CPU_PARALLEL_RAND_COPY
  - Values: Int ```(default=1)```
  - This variable controls how many parallel random number generator resources to create for all CPU contexts, for use in operators.

* MXNET_GPU_PARALLEL_RAND_COPY
  - Values: Int ```(default=4)```
  - This variable controls how many parallel random number generator resources to create for each GPU context, for use in operators.

* MXNET_GPU_CUDNN_DROPOUT_STATE_COPY
  - Values: Int ```(default=4)```
  - This variable controls how many cuDNN dropout state resources to create for each GPU context, for use in operators.

Settings for Minimum Memory Usage
---------------------------------
- Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
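All five resource-count settings added above share the same contract: an integer read from the environment, falling back to the documented default when unset. The sketch below only illustrates that contract; `GetIntEnvOrDefault` is a hypothetical stand-in for this example, not the helper the MXNet backend actually uses.

```cpp
// Minimal, self-contained sketch (not MXNet source): reading one of the
// resource-count variables documented above with its stated default.
#include <cstdlib>
#include <iostream>
#include <string>

// Hypothetical helper: parse an integer env var, falling back to a default.
int GetIntEnvOrDefault(const char* name, int default_value) {
  const char* raw = std::getenv(name);   // nullptr when the variable is unset
  if (raw == nullptr) return default_value;
  try {
    return std::stoi(raw);               // fall back to the default on bad input
  } catch (...) {
    return default_value;
  }
}

int main() {
  // default=4 per the documentation added in this commit
  int n_dropout_states = GetIntEnvOrDefault("MXNET_GPU_CUDNN_DROPOUT_STATE_COPY", 4);
  std::cout << "cuDNN dropout state copies per GPU: " << n_dropout_states << std::endl;
  return 0;
}
```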
20 changes: 20 additions & 0 deletions include/mxnet/resource.h
@@ -44,6 +44,11 @@ struct ResourceRequest {
kTempSpace,
/*! \brief common::RandGenerator<xpu> object, which can be used in GPU kernel functions */
kParallelRandom
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
,
/*! \brief cudnnDropoutDescriptor_t object for GPU dropout kernel functions */
kCuDNNDropoutDesc
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
};
/*! \brief type of resources */
Type type;
@@ -157,6 +162,21 @@ struct Resource {
reinterpret_cast<DType*>(get_space_internal(shape.Size() * sizeof(DType))),
shape, shape[ndim - 1], stream);
}
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
/*!
* \brief Get a cuDNN dropout descriptor backed by the shared dropout state space.
*
* \param dropout_desc pointer to a previously created cuDNN dropout descriptor.
* \param stream the GPU stream on which the dropout descriptor will be used.
* \param dropout the dropout probability to configure the descriptor with.
* \param seed the random seed used to initialize the dropout states.
*/
void get_cudnn_dropout_desc(
cudnnDropoutDescriptor_t* dropout_desc,
mshadow::Stream<gpu> *stream,
const float dropout,
uint64_t seed) const;
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7

/*!
* \brief Get CPU space as mshadow Tensor in specified type.
* The caller can request arbitrary size.
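For context, the snippet below is a standalone sketch of the raw cuDNN calls that a helper like `get_cudnn_dropout_desc` presumably wraps (an assumption; per the commit description, the actual implementation also caches and reuses the dropout state space across operator calls): query the per-device dropout state size, back a descriptor with that buffer, and seed it. It requires CUDA and cuDNN 7 or later, and error handling is abbreviated; the 0.5 rate and seed of 17 are arbitrary illustration values. Something like `nvcc sketch.cpp -lcudnn` should build it.

```cpp
// Standalone sketch of the cuDNN dropout-descriptor setup; not MXNet code.
#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstdio>

#define CHECK_CUDNN(expr)                                              \
  do {                                                                 \
    cudnnStatus_t status = (expr);                                     \
    if (status != CUDNN_STATUS_SUCCESS) {                              \
      std::printf("cuDNN error: %s\n", cudnnGetErrorString(status));   \
      return 1;                                                        \
    }                                                                  \
  } while (0)

int main() {
  cudnnHandle_t handle;
  CHECK_CUDNN(cudnnCreate(&handle));

  // Dropout state is a per-device buffer whose size cuDNN reports at runtime;
  // this is the buffer that the kCuDNNDropoutDesc resource shares between ops.
  size_t state_bytes = 0;
  CHECK_CUDNN(cudnnDropoutGetStatesSize(handle, &state_bytes));
  void* states = nullptr;
  cudaMalloc(&states, state_bytes);

  cudnnDropoutDescriptor_t dropout_desc;
  CHECK_CUDNN(cudnnCreateDropoutDescriptor(&dropout_desc));
  // Configure a 50% dropout descriptor backed by the allocated state buffer.
  CHECK_CUDNN(cudnnSetDropoutDescriptor(dropout_desc, handle, 0.5f,
                                        states, state_bytes, /*seed=*/17ULL));

  std::printf("dropout state size: %zu bytes\n", state_bytes);

  CHECK_CUDNN(cudnnDestroyDropoutDescriptor(dropout_desc));
  cudaFree(states);
  CHECK_CUDNN(cudnnDestroy(handle));
  return 0;
}
```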
5 changes: 4 additions & 1 deletion python/mxnet/gluon/nn/basic_layers.py
@@ -262,7 +262,10 @@ def __init__(self, rate, axes=(), **kwargs):
self._axes = axes

def hybrid_forward(self, F, x):
return F.Dropout(x, p=self._rate, axes=self._axes, name='fwd')
if self._rate > 0:
return F.Dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False)
else:
return F.identity(x)

def __repr__(self):
s = '{name}(p = {_rate}, axes={_axes})'
40 changes: 27 additions & 13 deletions src/executor/attach_op_resource_pass.cc
@@ -62,20 +62,34 @@ void AttachOpResources(
: fresource[op](inode.source->attrs);
// Get the resource of temporal space.
for (const ResourceRequest& req : reqs) {
if (req.type == ResourceRequest::kTempSpace) {
if (cached_temp.count(ctx) != 0) {
requested.push_back(cached_temp.at(ctx));
} else {
Resource r = ResourceManager::Get()->Request(ctx, req);
requested.push_back(r);
cached_temp[ctx] = r;
switch (req.type) {
case ResourceRequest::kTempSpace: {
// the braces open a new scope, which is required because this case declares a variable.
if (cached_temp.count(ctx) != 0) {
requested.push_back(cached_temp.at(ctx));
} else {
Resource r = ResourceManager::Get()->Request(ctx, req);
requested.push_back(r);
cached_temp[ctx] = r;
}
break;
}
} else if (req.type == ResourceRequest::kRandom) {
requested.push_back(ResourceManager::Get()->Request(ctx, req));
} else if (req.type == ResourceRequest::kParallelRandom) {
requested.push_back(ResourceManager::Get()->Request(ctx, req));
} else {
LOG(FATAL) << "resource type not yet supported";
case ResourceRequest::kRandom: {
requested.push_back(ResourceManager::Get()->Request(ctx, req));
break;
}
case ResourceRequest::kParallelRandom: {
requested.push_back(ResourceManager::Get()->Request(ctx, req));
break;
}
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
case ResourceRequest::kCuDNNDropoutDesc: {
requested.push_back(ResourceManager::Get()->Request(ctx, req));
break;
}
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
default:
LOG(FATAL) << "resource type " << req.type << " is not yet supported";
}
}
CHECK(vdispatch[nid] != DispatchMode::kUndefined);
6 changes: 6 additions & 0 deletions src/imperative/imperative_utils.h
@@ -241,6 +241,12 @@ inline void SetDependency(const nnvm::NodeAttrs& attrs,
requested.push_back(ResourceManager::Get()->Request(ctx, req));
write_vars.push_back(requested.back().var);
break;
#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
case ResourceRequest::kCuDNNDropoutDesc:
requested.push_back(ResourceManager::Get()->Request(ctx, req));
write_vars.push_back(requested.back().var);
break;
#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
default:
LOG(FATAL) << "resource type not yet supported";
}
4 changes: 2 additions & 2 deletions src/operator/cudnn_rnn-inl.h
@@ -699,7 +699,7 @@ class CuDNNRNNOp : public Operator {
if (param_.p > 0) {
CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_));
dropout_size_ = dropout_byte_ / sizeof(DType);
dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU());
dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id));
} else {
dropout_states_ = {};
dropout_byte_ = 0;
@@ -764,7 +764,7 @@
&reserve_space_byte_));
workspace_size_ = workspace_byte_ / sizeof(DType);
// Allocate the reserve space
reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU());
reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id));

// Check that number of params are correct
size_t cudnn_param_size;
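The two `Context::GPU(s->dev_id)` changes matter on multi-GPU machines: the dropout states and the reserve space must be allocated on the same device as the stream that consumes them, rather than defaulting to GPU 0. Below is a small plain-CUDA sketch (not MXNet code) of that per-device allocation rule.

```cpp
// Sketch: cudaMalloc allocates on whichever device is *current*, so per-device
// state must be allocated with the right device active -- the same reason the
// RNN code above now passes s->dev_id to Context::GPU() instead of defaulting
// to device 0.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int n_devices = 0;
  cudaGetDeviceCount(&n_devices);
  for (int dev = 0; dev < n_devices; ++dev) {
    cudaSetDevice(dev);             // make `dev` the current device
    void* states = nullptr;
    cudaMalloc(&states, 1 << 20);   // this 1 MiB buffer now lives on device `dev`
    std::printf("allocated dropout-state buffer on GPU %d\n", dev);
    cudaFree(states);
  }
  return 0;
}
```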