src/operator/contrib/ifft-inl.h

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * Copyright (c) 2015 by Contributors
 * \file ifft-inl.h
 * \brief Inverse FFT operator: cuFFT-based IFFTOp and its IFFTProp operator property.
 * \author Chen Zhu
*/
#ifndef MXNET_OPERATOR_CONTRIB_IFFT_INL_H_
#define MXNET_OPERATOR_CONTRIB_IFFT_INL_H_
#include <stdio.h>
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <map>
#include <vector>
#include <string>
#include <utility>
#include "../operator_common.h"
#include "../mshadow_op.h"

#if MXNET_USE_CUDA
#include <cufft.h>
#endif

namespace mxnet {
namespace op {
namespace ifft {
/*! \brief positions of the operator inputs; the input should represent complex values */
enum ifftOpInputs { kData = 0 };
/*! \brief positions of the operator outputs; the output should be real */
enum ifftOpOutputs { kOut = 0 };
/*! \brief positions of the resources requested by the operator */
enum ifftOpResource { kTempSpace = 0 };
}  // namespace ifft

/*!
 * \brief Parameters of the IFFT operator.
 */
struct IFFTParam : public dmlc::Parameter<IFFTParam> {
  /*! \brief the maximum size of sub-batch to be forwarded through cufft in one time */
  int compute_size;
  DMLC_DECLARE_PARAMETER(IFFTParam) {
    // compute_size is used as a divisor and as a cuFFT batch count by the operator,
    // so values below 1 are rejected when the parameter is parsed.
    DMLC_DECLARE_FIELD(compute_size).set_default(128).set_lower_bound(1)
    .describe("Maximum size of sub-batch to be forwarded at one time");
  }
};

#if MXNET_USE_CUDA
template<typename xpu, typename DType>
class IFFTOp : public Operator {
 public:
  explicit IFFTOp(IFFTParam p) {
    this->param_ = p;
    init_cufft_ = false;
    dim_ = 0;
  }

  virtual void Forward(const OpContext &ctx,
                       const std::vector<TBlob> &in_data,
                       const std::vector<OpReqType> &req,
                       const std::vector<TBlob> &out_data,
                       const std::vector<TBlob> &aux_args) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(in_data.size(), 1);
    CHECK_EQ(out_data.size(), 1);

    if (!init_cufft_) {
      n_iffts = in_data[ifft::kData].shape_.ProdShape(0, in_data[ifft::kData].ndim()-1);
      // remember that input is complex
      dim_ = in_data[ifft::kData].shape_[in_data[ifft::kData].ndim()-1]/2;
      // stride_ in the number of complex numbers
      stride_ = param_.compute_size*dim_;

      init_cufft_ = true;

      num_compute = n_iffts/param_.compute_size;
    }

    Stream<xpu> *s = ctx.get_stream<xpu>();
    const mxnet::TShape& ishape = in_data[ifft::kData].shape_;
    const mxnet::TShape& oshape = out_data[ifft::kOut].shape_;
    Tensor<xpu, 2, DType> data = in_data[ifft::kData].get_with_shape<xpu, 2, DType>(
          Shape2(n_iffts, dim_*2), s);
    Tensor<xpu, 2, DType> out = out_data[ifft::kOut].get_with_shape<xpu, 2, DType>(
          Shape2(n_iffts, dim_), s);
    // need temp space to store the intermediate complex matrices
    Tensor<xpu, 1, DType> workspace =
            ctx.requested[ifft::kTempSpace].get_space_typed<xpu, 1, DType>(
                Shape1(param_.compute_size*dim_*2), s);
    Tensor<xpu, 2, DType> complex_data = Tensor<xpu, 2, DType>(workspace.dptr_,
                                              Shape2(param_.compute_size, dim_*2), s);
    // start ifft
    cufftHandle plan;
    cufftPlanMany(&plan, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, CUFFT_C2C, param_.compute_size);
    for (size_t idx=0; idx < num_compute; ++idx) {
      cufftComplex* in_tmp = const_cast<cufftComplex*>(
        reinterpret_cast<const cufftComplex*>(data.dptr_ + 2*idx*stride_));
      cufftComplex* out_tmp = reinterpret_cast<cufftComplex*>(complex_data.dptr_);
      CHECK_EQ(cufftExecC2C(plan, in_tmp, out_tmp, CUFFT_INVERSE), CUFFT_SUCCESS);

      Assign(out.Slice(idx*param_.compute_size, (idx+1)*param_.compute_size),
             req[ifft::kOut], complex_toreal(complex_data));
    }
    cufftDestroy(plan);
    // handle the remaining samples
    size_t remain_num = n_iffts - param_.compute_size*num_compute;
    if (remain_num > 0) {
      cufftHandle plan_remain;
      cufftPlanMany(&plan_remain, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0,
                    CUFFT_C2C, remain_num);

      complex_data = Tensor<xpu, 2, DType>(workspace.dptr_,
                                              Shape2(remain_num, dim_*2), s);

      cufftComplex* in_tmp = const_cast<cufftComplex*>(
        reinterpret_cast<const cufftComplex*>(data.dptr_ + 2*num_compute*stride_));
      cufftComplex* out_tmp = reinterpret_cast<cufftComplex*>(complex_data.dptr_);
      CHECK_EQ(cufftExecC2C(plan_remain, in_tmp, out_tmp, CUFFT_INVERSE), CUFFT_SUCCESS);
        Assign(out.Slice(param_.compute_size*num_compute,
                         param_.compute_size*num_compute+remain_num),
             req[ifft::kOut], complex_toreal(complex_data));
      cufftDestroy(plan_remain);
    }
    // commenting this out to be consistant with caffe
    // out /= dim_;
  }
  virtual void Backward(const OpContext &ctx,
                        const std::vector<TBlob> &out_grad,
                        const std::vector<TBlob> &in_data,
                        const std::vector<TBlob> &out_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &in_grad,
                        const std::vector<TBlob> &aux_args) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(out_grad.size(), 1);
    CHECK(in_data.size() == 1 && in_grad.size() == 1);
    CHECK_EQ(req.size(), 1);

    Stream<xpu> *s = ctx.get_stream<xpu>();

    const mxnet::TShape& ishape = in_grad[ifft::kData].shape_;
    const mxnet::TShape& oshape = out_grad[ifft::kOut].shape_;
    Tensor<xpu, 2, DType> gdata = in_grad[ifft::kData].get_with_shape<xpu, 2, DType>(
          Shape2(n_iffts, dim_*2), s);
    Tensor<xpu, 2, DType> grad = out_grad[ifft::kOut].get_with_shape<xpu, 2, DType>(
          Shape2(n_iffts, dim_), s);
    // need temp space to pad the data into complex numbers due to cufft interface
    Tensor<xpu, 1, DType> workspace =
            ctx.requested[ifft::kTempSpace].get_space_typed<xpu, 1, DType>(
                Shape1(param_.compute_size*dim_*2), s);
    Tensor<xpu, 2, DType> complex_data = Tensor<xpu, 2, DType>(workspace.dptr_,
                                              Shape2(param_.compute_size, dim_*2), s);
    // start fft
    cufftHandle plan;
    cufftPlanMany(&plan, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, CUFFT_C2C, param_.compute_size);
    for (size_t idx = 0; idx < num_compute; ++idx) {
      complex_data = complex_pad_imag(grad.Slice(idx*param_.compute_size,
                                                 idx*param_.compute_size+param_.compute_size));

      cufftComplex* in_tmp = const_cast<cufftComplex*>(
        reinterpret_cast<const cufftComplex*>(complex_data.dptr_));
      cufftComplex* out_tmp = reinterpret_cast<cufftComplex*>(gdata.dptr_ + 2*idx*stride_);
      CHECK_EQ(cufftExecC2C(plan, in_tmp, out_tmp, CUFFT_FORWARD), CUFFT_SUCCESS);
    }
    cufftDestroy(plan);

    // handle the remaining samples
    size_t remain_num = n_iffts - param_.compute_size*num_compute;
    if (remain_num > 0) {
      cufftHandle plan_remain;
      cufftPlanMany(&plan_remain, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0,
                    CUFFT_C2C, remain_num);
      complex_data = Tensor<xpu, 2, DType>(workspace.dptr_,
                                          Shape2(remain_num, dim_*2), s);
      complex_data = complex_pad_imag(grad.Slice(
          num_compute*param_.compute_size, num_compute*param_.compute_size+remain_num));

      cufftComplex* in_tmp = const_cast<cufftComplex*>(
        reinterpret_cast<const cufftComplex*>(complex_data.dptr_));
      cufftComplex* out_tmp = reinterpret_cast<cufftComplex*>(gdata.dptr_ + 2*num_compute*stride_);
      CHECK_EQ(cufftExecC2C(plan_remain, in_tmp, out_tmp, CUFFT_FORWARD), CUFFT_SUCCESS);
      cufftDestroy(plan_remain);
    }
    // commenting this out to be consistant with caffe
    // gdata /= dim_;
  }

 private:
  IFFTParam param_;
  int dim_, stride_, n_iffts;
  size_t num_compute;
  bool init_cufft_;
};  // class IFFTOp

#endif  // MXNET_USE_CUDA

// Declare Factory Function, used for dispatch specialization.
// Only the declaration lives in this header: the template is parameterized on the
// device type `xpu`, and receives the operator parameters together with an integer
// type flag. The definitions are not visible here (presumably provided per device in
// the accompanying source files - confirm).
template<typename xpu>
Operator* CreateOp(IFFTParam param, int dtype);

#if DMLC_USE_CXX11
/*!
 * \brief Operator property of the IFFT operator: argument list, shape and type inference,
 *        resource requests and backward dependencies.
 */
class IFFTProp : public OperatorProperty {
 public:
  /*! \brief The operator takes a single argument named "data". */
  std::vector<std::string> ListArguments() const override {
    return {"data"};
  }

  /*! \brief Initialize the parameters from keyword arguments. */
  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
    param_.Init(kwargs);
  }

  /*! \brief Return the parameters as a string dictionary. */
  std::map<std::string, std::string> GetParams() const override {
    return param_.__DICT__();
  }

  /*!
   * \brief Infer the output shape: same as the input with the last axis halved.
   *
   * Only 2-D and 4-D inputs are supported. The last axis must have an even length,
   * because the operator reads it as pairs of values forming complex numbers.
   *
   * \param in_shape shapes of the inputs; must contain exactly one entry
   * \param out_shape cleared and filled with the single output shape
   * \param aux_shape unused
   * \return false when the input shape is not yet known or its rank is unsupported
   */
  bool InferShape(mxnet::ShapeVector *in_shape,
                  mxnet::ShapeVector *out_shape,
                  mxnet::ShapeVector *aux_shape) const override {
    using namespace mshadow;
    CHECK_EQ(in_shape->size(), 1) <<"Input:[data]";
    const mxnet::TShape &dshape = (*in_shape)[ifft::kData];
    // require data to be known
    if (mxnet::op::shape_is_none(dshape)) return false;

    out_shape->clear();
    const int ndim = dshape.ndim();
    if (ndim != 2 && ndim != 4) return false;
    // an odd length would be silently truncated by the division below and only fail
    // later, when the operator reshapes its buffers
    CHECK_EQ(dshape[ndim-1] % 2, 0)
        << "The last dimension of the ifft input must be even, got " << dshape[ndim-1];
    if (ndim == 4) {
      out_shape->push_back(Shape4(dshape[0], dshape[1], dshape[2], dshape[3]/2));
    } else {
      out_shape->push_back(Shape2(dshape[0], dshape[1]/2));
    }
    return true;
  }

  /*!
   * \brief Propagate the type of the first input to all inputs and to the output.
   * \param in_type input type flags; unknown entries (-1) are filled in
   * \param out_type cleared and filled with the single output type
   * \param aux_type unused
   * \return always true; mismatching or unspecified types raise an error instead
   */
  bool InferType(std::vector<int> *in_type,
                 std::vector<int> *out_type,
                 std::vector<int> *aux_type) const override {
    CHECK_GE(in_type->size(), 1);
    int dtype = (*in_type)[0];
    CHECK_NE(dtype, -1) << "First input must have specified type";
    for (size_t i=0; i < in_type->size(); ++i) {
      if ((*in_type)[i] == -1) {
        (*in_type)[i] = dtype;
      } else {
        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
      }
    }
    out_type->clear();
    out_type->push_back(dtype);
    return true;
  }

  /*! \brief Create a copy of this property carrying the same parameters. */
  OperatorProperty* Copy() const override {
    IFFTProp* ifft_sym = new IFFTProp();
    ifft_sym->param_ = this->param_;
    return ifft_sym;
  }

  /*! \brief Name under which the operator is registered. */
  std::string TypeString() const override {
    return "_contrib_ifft";
  }

  /*! \brief Backward depends on the output gradient and on the input data. */
  std::vector<int> DeclareBackwardDependency(
    const std::vector<int> &out_grad,
    const std::vector<int> &in_data,
    const std::vector<int> &out_data) const override {
    return {out_grad[ifft::kOut], in_data[ifft::kData]};
  }

  /*! \brief Forward requests temp space. */
  std::vector<ResourceRequest> ForwardResource(
      const mxnet::ShapeVector &in_shape) const override {
    return {ResourceRequest::kTempSpace};
  }

  /*! \brief Backward requests temp space. */
  std::vector<ResourceRequest> BackwardResource(
      const mxnet::ShapeVector &in_shape) const override {
    return {ResourceRequest::kTempSpace};
  }

  /*! \brief Offer the input data paired with the input gradient as an in-place option. */
  std::vector<std::pair<int, void*> > BackwardInplaceOption(
    const std::vector<int> &out_grad,
    const std::vector<int> &in_data,
    const std::vector<int> &out_data,
    const std::vector<void*> &in_grad) const override {
    return {{in_data[ifft::kData], in_grad[ifft::kData]}};
  }

  /*! \brief Not supported; use CreateOperatorEx, which also receives shapes and types. */
  Operator* CreateOperator(Context ctx) const override {
    LOG(FATAL) << "Not Implemented.";
    return nullptr;
  }

  /*! \brief Create the operator for the given context; defined outside this header. */
  Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
                              std::vector<int> *in_type) const override;

 private:
  IFFTParam param_;
};
#endif
}  // namespace op
}  // namespace mxnet
#endif  // MXNET_OPERATOR_CONTRIB_IFFT_INL_H_