Skip to content

Commit

Permalink
feat/convolution: change convolution functions to require workspace
Browse files Browse the repository at this point in the history
BREAKING CHANGE: All convolution functions now require
a SharedTensor<u8> workspace to be passed.
This allows for reuse of the workspace between different
convolution operations and a global shared workspace.

REFERENCE #27
  • Loading branch information
hobofan committed Mar 2, 2016
1 parent 6d0851f commit f9d4013
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 65 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ license = "MIT OR Apache-2.0"

[dependencies]
collenchyma = { version = "0.0.8", default-features = false }
cudnn = { version = "1.2.1", optional = true }
cudnn = { version = "1.3.0", optional = true }
libc = "0.2"
lazy_static = "0.1"
log = "0.3.2"

clippy = { version = "0.0.27", optional = true }

Expand Down
18 changes: 15 additions & 3 deletions src/frameworks/cuda/helper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -510,25 +510,29 @@ macro_rules! impl_ops_convolution_for {
filter: &mut ::co::tensor::SharedTensor<$t>,
x: &mut ::co::tensor::SharedTensor<$t>,
result: &mut ::co::tensor::SharedTensor<$t>,
workspace: &mut ::co::tensor::SharedTensor<u8>,
config: &Self::CC //::frameworks::cuda::CC
) -> Result<(), ::co::error::Error> {
match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
match result.add_device(self.device()) { _ => () }
match workspace.add_device(self.device()) { _ => () }

self.convolution_plain(filter, x, result, config)
self.convolution_plain(filter, x, result, workspace, config)
}

fn convolution_plain(
&self,
filter: &::co::tensor::SharedTensor<$t>,
x: &::co::tensor::SharedTensor<$t>,
result: &mut ::co::tensor::SharedTensor<$t>,
workspace: &mut ::co::tensor::SharedTensor<u8>,
config: &Self::CC
) -> Result<(), ::co::error::Error> {
let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default();

Ok(try!(match CUDNN.convolution_forward(
config,
try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }),
try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(filter, self.device()) }),
&try!(x.cudnn_tensor_desc()), // src_desc
try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data
Expand All @@ -549,13 +553,15 @@ macro_rules! impl_ops_convolution_for {
src_data: &mut ::co::tensor::SharedTensor<$t>,
dest_diff: &mut ::co::tensor::SharedTensor<$t>,
filter_diff: &mut ::co::tensor::SharedTensor<$t>,
workspace: &mut ::co::tensor::SharedTensor<u8>,
config: &Self::CC
) -> Result<(), ::co::error::Error> {
match src_data.add_device(self.device()) { _ => try!(src_data.sync(self.device())) }
match dest_diff.add_device(self.device()) { _ => try!(dest_diff.sync(self.device())) }
match filter_diff.add_device(self.device()) { _ => try!(filter_diff.sync(self.device())) }
match workspace.add_device(self.device()) { _ => () }

self.convolution_grad_filter_plain(src_data, dest_diff, filter_diff, config)
self.convolution_grad_filter_plain(src_data, dest_diff, filter_diff, workspace, config)
}

#[allow(unused_variables)]
Expand All @@ -564,12 +570,14 @@ macro_rules! impl_ops_convolution_for {
src_data: &::co::tensor::SharedTensor<$t>,
dest_diff: &::co::tensor::SharedTensor<$t>,
filter_diff: &mut ::co::tensor::SharedTensor<$t>,
workspace: &mut ::co::tensor::SharedTensor<u8>,
config: &Self::CC
) -> Result<(), ::co::error::Error> {
let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default();

Ok(try!(match CUDNN.convolution_backward_filter(
config,
try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }),
&try!(src_data.cudnn_tensor_desc()),
try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(src_data, self.device()) }),
&try!(dest_diff.cudnn_tensor_desc()),
Expand All @@ -590,13 +598,15 @@ macro_rules! impl_ops_convolution_for {
filter: &mut ::co::tensor::SharedTensor<$t>,
x_diff: &mut ::co::tensor::SharedTensor<$t>,
result_diff: &mut ::co::tensor::SharedTensor<$t>,
workspace: &mut ::co::tensor::SharedTensor<u8>,
config: &Self::CC
) -> Result<(), ::co::error::Error> {
match filter.add_device(self.device()) { _ => try!(filter.sync(self.device())) }
match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) }
match result_diff.add_device(self.device()) { _ => try!(result_diff.sync(self.device())) }
match workspace.add_device(self.device()) { _ => () }

self.convolution_grad_data_plain(filter, x_diff, result_diff, config)
self.convolution_grad_data_plain(filter, x_diff, result_diff, workspace, config)
}

#[allow(unused_variables)]
Expand All @@ -605,12 +615,14 @@ macro_rules! impl_ops_convolution_for {
filter: &::co::tensor::SharedTensor<$t>,
x_diff: &::co::tensor::SharedTensor<$t>,
result_diff: &mut ::co::tensor::SharedTensor<$t>,
workspace: &mut ::co::tensor::SharedTensor<u8>,
config: &Self::CC
) -> Result<(), ::co::error::Error> {
let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default();

Ok(try!(match CUDNN.convolution_backward_data(
config,
try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }),
try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(filter, self.device()) }),
&try!(x_diff.cudnn_tensor_desc()),
try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }),
Expand Down
60 changes: 21 additions & 39 deletions src/frameworks/cuda/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,12 @@ impl ConvBackwardDataAlgo {

macro_rules! impl_convolution_for_cuda_backend {
($t:ty, $cutype:path) => (
// Bridges the cuDNN-provided config to the plugin's ConvolutionConfig trait
// so callers can size a shared workspace from it.
impl ConvolutionConfig<$t> for ::cudnn::utils::ConvolutionConfig {
    /// Workspace size in bytes to allocate for this convolution config.
    ///
    /// NOTE(review): `largest_workspace_size()` presumably returns the max
    /// over the forward / backward-filter / backward-data algorithm
    /// requirements — confirm against the `cudnn` crate's implementation.
    fn workspace_size(&self) -> usize {
        *self.largest_workspace_size()
    }
}

impl Convolution<$t> for Backend<Cuda> {
fn new_convolution_config(
&self,
Expand All @@ -292,53 +298,29 @@ macro_rules! impl_convolution_for_cuda_backend {
let conv_desc = ::cudnn::ConvolutionDescriptor::new(zero_padding, stride, $cutype).unwrap();

let useable_algo_fwd = try!(algo_fwd.find_cudnn_algo(&filter_desc, &conv_desc, &src_desc, &dest_desc));
let (workspace_fwd, workspace_size_fwd) = match try!(useable_algo_fwd.needs_cudnn_workspace()) {
false => (::co::frameworks::cuda::Memory::from_c(0), 0),
true => {
let workspace_size_fwd = API::get_convolution_forward_workspace_size(*CUDNN.id_c(), useable_algo_fwd.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap();
let workspace_forward = ::co::frameworks::cuda::Memory::new(workspace_size_fwd).unwrap();
(workspace_forward, workspace_size_fwd)
}
};

let useable_algo_bwd_filter = try!(algo_bwd_filter.find_cudnn_algo(&filter_desc, &conv_desc, &src_desc, &dest_desc));
let (workspace_bwd_filter, workspace_size_bwd_filter) = match try!(useable_algo_bwd_filter.needs_cudnn_workspace()) {
false => (::co::frameworks::cuda::Memory::from_c(0), 0),
true => {
let workspace_size_bwd_filter = API::get_convolution_backward_filter_workspace_size(*CUDNN.id_c(), useable_algo_bwd_filter.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap();
let workspace_backward = ::co::frameworks::cuda::Memory::new(workspace_size_bwd_filter).unwrap();
(workspace_backward, workspace_size_bwd_filter)
}
};

let useable_algo_bwd_data = try!(algo_bwd_data.find_cudnn_algo(&filter_desc, &conv_desc, &src_desc, &dest_desc));
let (workspace_bwd_data, workspace_size_bwd_data) = match try!(useable_algo_bwd_data.needs_cudnn_workspace()) {
false => (::co::frameworks::cuda::Memory::from_c(0), 0),
true => {
let workspace_size_bwd_data = API::get_convolution_backward_data_workspace_size(*CUDNN.id_c(), useable_algo_bwd_data.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap();
let workspace_backward = ::co::frameworks::cuda::Memory::new(workspace_size_bwd_data).unwrap();
(workspace_backward, workspace_size_bwd_data)
}

let workspace_size_fwd = match try!(useable_algo_fwd.needs_cudnn_workspace()) {
false => 0,
true => API::get_convolution_forward_workspace_size(*CUDNN.id_c(), useable_algo_fwd.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap(),
};

// share one workspace to reduce memory
let workspace: ::co::frameworks::cuda::Memory;
if workspace_size_bwd_data >= workspace_size_bwd_filter && workspace_size_bwd_data >= workspace_size_fwd {
workspace = workspace_bwd_data;
} else if workspace_size_bwd_filter >= workspace_size_bwd_data && workspace_size_bwd_filter >= workspace_size_fwd {
workspace = workspace_bwd_filter;
} else {
workspace = workspace_fwd;
}
let workspace_size_bwd_filter = match try!(useable_algo_bwd_filter.needs_cudnn_workspace()) {
false => 0,
true => API::get_convolution_backward_filter_workspace_size(*CUDNN.id_c(), useable_algo_bwd_filter.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap(),
};

let workspace_bwd_filter = ::co::frameworks::cuda::Memory::from_c(*workspace.id_c());
let workspace_fwd = ::co::frameworks::cuda::Memory::from_c(*workspace.id_c());
let workspace_size_bwd_data = match try!(useable_algo_bwd_data.needs_cudnn_workspace()) {
false => 0,
true => API::get_convolution_backward_data_workspace_size(*CUDNN.id_c(), useable_algo_bwd_data.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap(),
};

Ok(
::cudnn::utils::ConvolutionConfig::new(
useable_algo_fwd.as_cudnn().unwrap(), workspace_fwd, workspace_size_fwd,
useable_algo_bwd_filter.as_cudnn().unwrap(), workspace_bwd_filter, workspace_size_bwd_filter,
useable_algo_bwd_data.as_cudnn().unwrap(), workspace, workspace_size_bwd_data,
useable_algo_fwd.as_cudnn().unwrap(), workspace_size_fwd,
useable_algo_bwd_filter.as_cudnn().unwrap(), workspace_size_bwd_filter,
useable_algo_bwd_data.as_cudnn().unwrap(), workspace_size_bwd_data,
conv_desc, filter_desc
)
)
Expand Down
1 change: 1 addition & 0 deletions src/frameworks/native/helper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ pub fn tanh_grad<T: Float>(x: &T, dx: &T) -> T {
// For each listed scalar type `$t` (e.g. f32, f64), marks the native
// backend's ConvolutionConfig as a valid operation config by implementing
// both the generic NNOperationConfig marker and the ConvolutionConfig
// trait (the latter keeps its default `workspace_size()` of 0, since the
// native implementation needs no scratch workspace).
macro_rules! impl_oconf_for_cc(($($t: ident), +) => (
    $(
        impl<'a> NNOperationConfig<$t> for ::frameworks::native::helper::ConvolutionConfig { }
        impl<'a> ConvolutionConfig<$t> for ::frameworks::native::helper::ConvolutionConfig { }
    )+
));

Expand Down
9 changes: 4 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
//! extern crate collenchyma_nn as nn;
//! # #[cfg(feature = "cuda")]
//! # mod cuda {
//! use co::backend::{Backend, BackendConfig};
//! use co::backend::{Backend, BackendConfig, IBackend};
//! use co::framework::IFramework;
//! use co::frameworks::{Cuda, Native};
//! use co::memory::MemoryType;
Expand All @@ -56,10 +56,7 @@
//! pub fn main() {
//! // Initialize a CUDA Backend.
//! // Usually you would not use CUDA but let Collenchyma pick what is available on the machine.
//! let framework = Cuda::new();
//! let hardwares = framework.hardwares();
//! let backend_config = BackendConfig::new(framework, hardwares);
//! let backend = Backend::new(backend_config).unwrap();
//! let backend = Backend::<Cuda>::default().unwrap();
//! // Initialize two SharedTensors.
//! let mut x = SharedTensor::<f32>::new(backend.device(), &(1, 1, 3)).unwrap();
//! let mut result = SharedTensor::<f32>::new(backend.device(), &(1, 1, 3)).unwrap();
Expand Down Expand Up @@ -132,6 +129,8 @@ extern crate cudnn;
extern crate libc;
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate log;

pub use plugin::*;

Expand Down
25 changes: 18 additions & 7 deletions src/plugin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,21 @@ impl ConvBackwardDataAlgo {
/// Needs to be implemented for Operation specific configurations.
///
/// Marker trait: a type implementing this can serve as a backend's
/// configuration object for a neural-network operation over element
/// type `F`. It carries no required methods of its own.
pub trait NNOperationConfig<F> {}

/// Provides Convolution Config functionality.
///
/// Needs to be implemented for Operation specific configurations.
pub trait ConvolutionConfig<F> {
    /// Returns the largest workspace size in bytes needed
    /// for any of the convolution operations.
    ///
    /// Defaults to `0` for implementations that do not need a
    /// scratch workspace (e.g. a plain native implementation);
    /// backends that do (e.g. cuDNN) override this so callers can
    /// allocate one shared `SharedTensor<u8>` workspace large enough
    /// for every convolution pass.
    fn workspace_size(&self) -> usize {
        0
    }
}

/// Provides the functionality for a backend to support Neural Network related operations.
pub trait NN<F> {
/// The Convolution Operation Config representation for this Plugin.
type CC: NNOperationConfig<F>;
type CC: NNOperationConfig<F> + ConvolutionConfig<F>;
/// The LRN Operation Config representation for this Plugin.
type CLRN: NNOperationConfig<F>;
/// The Pooling Operation Config representation for this Plugin.
Expand Down Expand Up @@ -368,7 +379,7 @@ pub trait Convolution<F> : NN<F> {
/// Saves the result to `result`.
///
/// For a no-memory managed version see `convolution_plain`.
fn convolution(&self, filter: &mut SharedTensor<F>, x: &mut SharedTensor<F>, result: &mut SharedTensor<F>, config: &Self::CC) -> Result<(), ::co::error::Error>;
fn convolution(&self, filter: &mut SharedTensor<F>, x: &mut SharedTensor<F>, result: &mut SharedTensor<F>, workspace: &mut SharedTensor<u8>, config: &Self::CC) -> Result<(), ::co::error::Error>;

/// Computes the convolution over the input Tensor `x` without any memory management.
///
Expand All @@ -377,15 +388,15 @@ pub trait Convolution<F> : NN<F> {
/// *Attention*:<br/>
/// For a correct computation result, you need to manage the memory allocation and synchronization yourself.<br/>
/// For a memory managed version see `convolution`.
fn convolution_plain(&self, filter: &SharedTensor<F>, x: &SharedTensor<F>, result: &mut SharedTensor<F>, config: &Self::CC) -> Result<(), ::co::error::Error>;
fn convolution_plain(&self, filter: &SharedTensor<F>, x: &SharedTensor<F>, result: &mut SharedTensor<F>, workspace: &mut SharedTensor<u8>, config: &Self::CC) -> Result<(), ::co::error::Error>;

/// Computes the gradient of a [CNN convolution][convolution] with respect to the filter and complete memory management.
/// [convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network
///
/// Saves the result to `filter_diff`.
///
/// For a no-memory managed version see `convolution_grad_filter_plain`.
fn convolution_grad_filter(&self, src_data: &mut SharedTensor<F>, dest_diff: &mut SharedTensor<F>, filter_diff: &mut SharedTensor<F>, config: &Self::CC) -> Result<(), ::co::error::Error>;
fn convolution_grad_filter(&self, src_data: &mut SharedTensor<F>, dest_diff: &mut SharedTensor<F>, filter_diff: &mut SharedTensor<F>, workspace: &mut SharedTensor<u8>, config: &Self::CC) -> Result<(), ::co::error::Error>;

/// Computes the gradient of a convolution with respect to the filter and without any memory management.
///
Expand All @@ -394,15 +405,15 @@ pub trait Convolution<F> : NN<F> {
/// *Attention*:<br/>
/// For a correct computation result, you need to manage the memory allocation and synchronization yourself.<br/>
/// For a memory managed version see `convolution_grad_filter`.
fn convolution_grad_filter_plain(&self, src_data: &SharedTensor<F>, dest_diff: &SharedTensor<F>, filter_diff: &mut SharedTensor<F>, config: &Self::CC) -> Result<(), ::co::error::Error>;
fn convolution_grad_filter_plain(&self, src_data: &SharedTensor<F>, dest_diff: &SharedTensor<F>, filter_diff: &mut SharedTensor<F>, workspace: &mut SharedTensor<u8>, config: &Self::CC) -> Result<(), ::co::error::Error>;

/// Computes the gradient of a [CNN convolution][convolution] over the input Tensor `x` with respect to the data and complete memory management.
/// [convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network
///
/// Saves the result to `result_diff`.
///
/// For a no-memory managed version see `convolution_grad_data_plain`.
fn convolution_grad_data(&self, filter: &mut SharedTensor<F>, x_diff: &mut SharedTensor<F>, result_diff: &mut SharedTensor<F>, config: &Self::CC) -> Result<(), ::co::error::Error>;
fn convolution_grad_data(&self, filter: &mut SharedTensor<F>, x_diff: &mut SharedTensor<F>, result_diff: &mut SharedTensor<F>, workspace: &mut SharedTensor<u8>, config: &Self::CC) -> Result<(), ::co::error::Error>;

/// Computes the gradient of a convolution over the input Tensor `x` with respect to the data and without any memory management.
///
Expand All @@ -411,7 +422,7 @@ pub trait Convolution<F> : NN<F> {
/// *Attention*:<br/>
/// For a correct computation result, you need to manage the memory allocation and synchronization yourself.<br/>
/// For a memory managed version see `convolution_grad_data`.
fn convolution_grad_data_plain(&self, filter: &SharedTensor<F>, x_diff: &SharedTensor<F>, result_diff: &mut SharedTensor<F>, config: &Self::CC) -> Result<(), ::co::error::Error>;
fn convolution_grad_data_plain(&self, filter: &SharedTensor<F>, x_diff: &SharedTensor<F>, result_diff: &mut SharedTensor<F>, workspace: &mut SharedTensor<u8>, config: &Self::CC) -> Result<(), ::co::error::Error>;
}

// /// Computes the backward Convolution function w.r.t the bias.
Expand Down
Loading

0 comments on commit f9d4013

Please sign in to comment.