include/mxnet/base.h

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file base.h
 * \brief configuration of MXNet as well as basic data structure.
 */
#ifndef MXNET_BASE_H_
#define MXNET_BASE_H_

#include "dmlc/base.h"
#include <string>
#include "dmlc/io.h"
#include "dmlc/type_traits.h"
#include "dmlc/parameter.h"
#include "mshadow/tensor.h"
// nnvm headers for symbolic construction.
#include "nnvm/op.h"
#include "nnvm/symbolic.h"
#include "libinfo.h"
#include "tuple.h"

/*!
 * \brief Symbol visibility macro for the public MXNet API.
 *
 * With Visual Studio (_MSC_VER defined) it expands to __declspec(dllexport) when
 * MXNET_EXPORTS is defined and to __declspec(dllimport) otherwise.
 * With every other compiler it expands to nothing.
 */
#ifdef _MSC_VER
#ifdef MXNET_EXPORTS
#define MXNET_API __declspec(dllexport)
#else
#define MXNET_API __declspec(dllimport)
#endif
#else
#define MXNET_API
#endif

/*!
 * \brief Prediction-only build switch.
 *
 * Defaults to 0 unless the build already defines MXNET_PREDICT_ONLY.
 * NOTE(review): the effect of a non-zero value is decided by code outside this header.
 */
#ifndef MXNET_PREDICT_ONLY
#define MXNET_PREDICT_ONLY 0
#endif

/*! \brief major version */
#define MXNET_MAJOR 2
/*! \brief minor version */
#define MXNET_MINOR 0
/*! \brief patch version */
#define MXNET_PATCH 0
/*! \brief mxnet version, encoded as major * 10000 + minor * 100 + patch */
#define MXNET_VERSION (MXNET_MAJOR * 10000 + MXNET_MINOR * 100 + MXNET_PATCH)
/*!
 * \brief helper for making a version number with the same encoding as MXNET_VERSION.
 *
 * Every argument is parenthesized so that expressions with lower precedence than `+`
 * (e.g. `a << b`, `c ? x : y`) are evaluated as a unit.
 */
#define MXNET_MAKE_VERSION(major, minor, patch) ((major)*10000 + (minor)*100 + (patch))
/*!
 * \brief define function name as profiler message
 */
#define PROFILER_MESSAGE_FUNCNAME (__FUNCTION__)

/*! \brief namespace of mxnet */
namespace mxnet {
/*! \brief device tag for CPU, alias of mshadow::cpu */
using cpu = mshadow::cpu;
/*! \brief device tag for GPU, alias of mshadow::gpu */
using gpu = mshadow::gpu;
/*! \brief index type, alias of mshadow::index_t */
using index_t = mshadow::index_t;
/*! \brief index type for the blas/lapack library, alias of mshadow::lapack_index_t */
using lapack_index_t = mshadow::lapack_index_t;
/*! \brief default real type used to store ndarray data, alias of mshadow::default_real_t */
using real_t = mshadow::default_real_t;
/*! \brief operator structure from NNVM */
using Op = nnvm::Op;

/*! \brief Context information about the execution environment */
struct Context {
  /*! \brief Type of device */
  enum DeviceType {
    kCPU       = cpu::kDevMask,
    kGPU       = gpu::kDevMask,
    kCPUPinned = 3,
    kCPUShared = 5,
  };
  /*! \brief the device type we run the op on */
  DeviceType dev_type;
  /*! \brief device id we are going to run it on */
  int32_t dev_id;
  /*! \brief default constructor, yields cpu(0) */
  Context() : dev_type(kCPU), dev_id(0) {}
  /*!
   * \brief Get corresponding device mask
   * \return kCPU for kCPUPinned and kCPUShared, the device type itself otherwise
   */
  inline DeviceType dev_mask() const {
    if (dev_type == kCPUPinned || dev_type == kCPUShared)
      return kCPU;
    return dev_type;
  }
  /*!
   * \brief Returns dev_id for kGPU and kCPUPinned, 0 otherwise
   */
  inline int real_dev_id() const {
    if (dev_type == kCPUPinned || dev_type == kGPU)
      return dev_id;
    return 0;
  }
  /*!
   * \brief Comparator, used to enable Context as std::map key.
   * \param b another context to compare
   * \return compared result (ordered by device type first, then by device id)
   */
  inline bool operator<(const Context& b) const;
  /*!
   * \brief check if current context equals another one
   * \param b another context to compare
   * \return whether device type and device id are the same
   */
  inline bool operator==(const Context& b) const {
    return dev_type == b.dev_type && dev_id == b.dev_id;
  }
  /*!
   * \brief check if current context not equals another one
   * \param b another context to compare
   * \return whether they are not the same
   */
  inline bool operator!=(const Context& b) const {
    return !(*this == b);
  }
  /*!
   * \brief save the content into binary stream
   *
   * Writes the raw bytes of dev_type followed by the raw bytes of dev_id.
   * \param strm the output stream
   */
  inline void Save(dmlc::Stream* strm) const {
    strm->Write(&dev_type, sizeof(dev_type));
    strm->Write(&dev_id, sizeof(dev_id));
  }
  /*!
   * \brief load the content from binary stream
   *
   * Reads dev_type and then dev_id, i.e. the layout produced by Save.
   * \param strm the input stream
   * \return whether the load is successful
   */
  inline bool Load(dmlc::Stream* strm) {
    if (strm->Read(&dev_type, sizeof(dev_type)) != sizeof(dev_type))
      return false;
    // Sized via the member (as in Save) so both sides stay in sync if the type changes.
    if (strm->Read(&dev_id, sizeof(dev_id)) != sizeof(dev_id))
      return false;
    return true;
  }
  /*! \brief the maximal device type */
  static const int32_t kMaxDevType = 6;
  /*! \brief the maximal device index */
  static const int32_t kMaxDevID = 16;
  /*!
   * \brief Create a new context.
   * \param dev_type device type.
   * \param dev_id device id. -1 for current device.
   * \return the created context
   */
  inline static Context Create(DeviceType dev_type, int32_t dev_id = -1);
  /*!
   * \brief Create a CPU context.
   * \param dev_id the device id.
   * \return CPU Context
   */
  inline static Context CPU(int32_t dev_id = 0);
  /*!
   * \brief Create a GPU context.
   * \param dev_id the device id. -1 for current GPU.
   * \return GPU Context.
   */
  inline static Context GPU(int32_t dev_id = -1);
  /*!
   * \brief Get the number of GPUs available.
   * \return The number of GPUs that are available.
   */
  inline static int32_t GetGPUCount();
  /*!
   * \brief Is the cuda driver installed and visible to the system.
   * \return Whether the driver is present.
   */
  inline static bool GPUDriverPresent();
  /*!
   * \brief Get the number of streams that a GPU Worker has available to operations.
   * \return The number of streams that are available.
   */
  inline static int32_t GetGPUStreamsPerWorker();
  /*!
   * \brief get the free and total available memory on a GPU
   * \param dev the GPU number to query
   * \param free_mem pointer to the uint64_t holding free GPU memory
   * \param total_mem pointer to the uint64_t holding total GPU memory
   */
  inline static void GetGPUMemoryInformation(int dev, uint64_t* free_mem, uint64_t* total_mem);
  /*!
   * \brief Create a pinned CPU context.
   * \param dev_id the device id for corresponding GPU. -1 for current GPU.
   * \return Pinned CPU context.
   */
  inline static Context CPUPinned(int32_t dev_id = -1);
  /*!
   * \brief Create a CPU shared memory context.
   * \param dev_id dummy device id.
   * \return CPU shared memory context.
   */
  inline static Context CPUShared(int32_t dev_id = 0);
  /*!
   * \brief Create a context from string of the format [cpu|gpu|cpu_pinned|cpu_shared](n)
   * \param str the string pattern
   * \return Context
   */
  inline static Context FromString(const std::string& str);

 private:
#if MXNET_USE_CUDA
  /*! \brief CUDA library checks, invoked by Create for device types with the kGPU bit set. */
  static void CudaLibChecks();
#endif
#if MXNET_USE_CUDNN
  /*! \brief cuDNN library checks, invoked by Create for device types with the kGPU bit set. */
  static void CuDNNLibChecks();
#endif
};

#if MXNET_USE_CUDA
/*!
 * \brief Holds an auxiliary mshadow gpu stream that can be synced with a primary stream.
 *
 * When fewer than two streams per worker are configured, the auxiliary stream is simply the
 * primary stream and all synchronization calls become no-ops.
 * NOTE(review): the destructor releases the created stream and event, yet the implicit copy
 * operations are still available; copying an instance that created its own stream looks unsafe.
 */
class GPUAuxStream {
 public:
  /*!
   * \brief constructor.
   * \param primary_stream gpu stream that is synced with the created auxiliary stream.
   */
  explicit GPUAuxStream(mshadow::Stream<gpu>* primary_stream)
      : primary_stream_(primary_stream),
        aux_stream_(primary_stream),
        gpu_stream_sync_event_(nullptr) {
    // With a single stream per worker the aux stream stays aliased to the primary one.
    if (Context::GetGPUStreamsPerWorker() < 2) {
      return;
    }
    // Mirror the handle ownership of the primary stream on the new auxiliary stream.
    const bool wants_blas_handle =
        primary_stream->blas_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle;
    const bool wants_dnn_handle =
        primary_stream->dnn_handle_ownership_ == mshadow::Stream<gpu>::OwnHandle;
    aux_stream_ =
        mshadow::NewStream<gpu>(wants_blas_handle, wants_dnn_handle, primary_stream->dev_id);
    MSHADOW_CUDA_CALL(cudaEventCreateWithFlags(&gpu_stream_sync_event_, cudaEventDisableTiming));
  }
  /*! \brief destructor, releases the stream and event only if the constructor created them */
  ~GPUAuxStream() {
    if (aux_stream_ == primary_stream_) {
      // Aliased to the primary stream: nothing was created, nothing to destroy.
      return;
    }
    MSHADOW_CATCH_ERROR(mshadow::DeleteStream<gpu>(aux_stream_));
    MSHADOW_CATCH_ERROR(cudaEventDestroy(gpu_stream_sync_event_));
  }
  /*!
   * \brief Makes future aux stream work wait on the completion of existing primary stream work.
   */
  void PreAuxStreamUseSync() {
    if (aux_stream_ == primary_stream_) {
      return;  // same stream, ordering is implicit
    }
    StreamSync(primary_stream_, aux_stream_, gpu_stream_sync_event_);
  }
  /*!
   * \brief Makes future primary stream work wait on the completion of existing aux stream work.
   */
  void PostAuxStreamUseSync() {
    if (aux_stream_ == primary_stream_) {
      return;  // same stream, ordering is implicit
    }
    StreamSync(aux_stream_, primary_stream_, gpu_stream_sync_event_);
  }
  /*! \brief Getter for the auxiliary stream (equals the primary stream when none was created). */
  mshadow::Stream<gpu>* GetStream() {
    return aux_stream_;
  }
  /*!
   * \brief Make future work enqueued to `s2` wait on completion of current work enqueued to `s1`.
   * \param s1 stream with work that must be completed before future s2 work can begin.
   * \param s2 stream whose future work is made to wait on the completion of existing s1 work.
   * \param event used to pass s1 state to s2.
   */
  static void StreamSync(mshadow::Stream<gpu>* s1, mshadow::Stream<gpu>* s2, cudaEvent_t event) {
    // Record the current tail of s1, then make s2 wait for that recorded point.
    MSHADOW_CUDA_CALL(cudaEventRecord(event, s1->stream_));
    MSHADOW_CUDA_CALL(cudaStreamWaitEvent(s2->stream_, event, 0));
  }

 private:
  /*! \brief stream the auxiliary stream is synchronized against */
  mshadow::Stream<gpu>* primary_stream_;
  /*! \brief auxiliary stream; equal to primary_stream_ when no separate stream was created */
  mshadow::Stream<gpu>* aux_stream_;
  /*! \brief event used by StreamSync; nullptr when no separate stream was created */
  cudaEvent_t gpu_stream_sync_event_;
};

/*!
 * \brief Provides automatic coordination of an auxilary stream with a primary one.
 * This object, upon construction, prepares an aux stream for use by syncing it with enqueued
 * primary-stream work.  Object destruction will sync again so future primary-stream work
 * will wait on enqueued aux-stream work.  If MXNET_GPU_WORKER_NSTREAMS == 1, then this defaults
 * simply: the primary stream will equal the aux stream and the syncs will be executed as nops.
 * See ./src/operator/cudnn/cudnn_convolution-inl.h for a usage example.
 */
class SyncedGPUAuxStream {
 public:
  /*!
   * \brief constructor.
   * \param gpu_aux_stream auxilary gpu stream that is managed by this RAII object.
   */
  explicit SyncedGPUAuxStream(GPUAuxStream* gpu_aux_stream) : gpu_aux_stream_(gpu_aux_stream) {
    gpu_aux_stream_->PreAuxStreamUseSync();
  }
  /*! \brief destructor */
  ~SyncedGPUAuxStream() {
    gpu_aux_stream_->PostAuxStreamUseSync();
  }
  /*! \brief copy constructor deleted to prevent unexpected synchronizations. */
  SyncedGPUAuxStream(const SyncedGPUAuxStream&) = delete;
  /*! \brief copy assignment operator deleted to prevent unexpected synchronizations. */
  void operator=(const SyncedGPUAuxStream&) = delete;
  /*! \brief move constructor permitted as alternative to copying. */
  SyncedGPUAuxStream(SyncedGPUAuxStream&&) = default;
  /*! \brief move assignment operator permitted as alternative to copy assignment. */
  SyncedGPUAuxStream& operator=(SyncedGPUAuxStream&&) = default;
  /*! \brief Getter for underlying mshadow::Stream<gpu>. */
  inline mshadow::Stream<gpu>* GetStream() const {
    return gpu_aux_stream_->GetStream();
  }

 private:
  GPUAuxStream* gpu_aux_stream_;
};
#endif  // MXNET_USE_CUDA

/*!
 * \brief execution time context.
 *  The information needed in runtime for actual execution.
 */
struct RunContext {
  /*! \brief base Context */
  Context ctx;
  /*!
   * \brief the stream of the device, can be nullptr or Stream<gpu>* in GPU mode
   *
   * Defaults to nullptr (like event_pool) so a default-constructed RunContext never exposes
   * an indeterminate pointer through get_stream().
   */
  void* stream = nullptr;
  /*!
   * \brief the auxiliary stream of the device, can be nullptr or Stream<gpu>* in GPU mode
   *
   * NOTE(review): get_gpu_aux_stream() casts this pointer to GPUAuxStream*, which does not
   * match the Stream<gpu>* wording above; confirm which type callers actually store here.
   */
  void* aux_stream = nullptr;
  /*!
   * \brief pointer to the cuda event pool used by the dependency engine
   */
  void* event_pool = nullptr;
  /*!
   * \brief get mshadow stream from Context
   * \return the mshadow stream
   * \tparam xpu the device type of the stream
   */
  template <typename xpu>
  inline mshadow::Stream<xpu>* get_stream() const {
    return static_cast<mshadow::Stream<xpu>*>(stream);
  }
#if MXNET_USE_CUDA
  /*!
   * \brief get an RAII object that transparently handles the syncing of the auxiliary stream.
   * \return the aux stream auto-syncing object
   */
  inline SyncedGPUAuxStream get_gpu_aux_stream() const {
    return SyncedGPUAuxStream(static_cast<GPUAuxStream*>(aux_stream));
  }
#endif
  /*! \brief get the base Context from RunContext */
  inline const Context& get_ctx() const {
    return ctx;
  }
};
}  // namespace mxnet

//! \cond Doxygen_Suppress
namespace mxnet {
// implementing Context
/*! \brief Strict ordering: by device type first, ties broken by device id. */
inline bool Context::operator<(const Context& b) const {
  if (dev_type != b.dev_type) {
    return dev_type < b.dev_type;
  }
  return dev_id < b.dev_id;
}
/*!
 * \brief Build a context of the given type.
 *
 * A negative dev_id is stored as 0, except for device types with the kGPU bit set, where the
 * current CUDA device is queried instead (fatal error if built without CUDA).
 */
inline Context Context::Create(DeviceType dev_type, int32_t dev_id) {
  const bool use_current_device = dev_id < 0;

  Context ctx;
  ctx.dev_type = dev_type;
  ctx.dev_id   = use_current_device ? 0 : dev_id;

  // Device types without the kGPU bit need no further setup.
  if (!(dev_type & kGPU)) {
    return ctx;
  }

#if MXNET_USE_CUDA
  CudaLibChecks();
#endif
#if MXNET_USE_CUDNN
  CuDNNLibChecks();
#endif
  if (use_current_device) {
#if MXNET_USE_CUDA
    CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
#else
    LOG(FATAL) << "Please compile with CUDA enabled for cuda features";
#endif
  }
  return ctx;
}
/*! \brief Shorthand for Create with device type kCPU. */
inline Context Context::CPU(int32_t dev_id) {
  return Context::Create(DeviceType::kCPU, dev_id);
}

/*! \brief Shorthand for Create with device type kCPUPinned. */
inline Context Context::CPUPinned(int32_t dev_id) {
  return Context::Create(DeviceType::kCPUPinned, dev_id);
}

/*! \brief Shorthand for Create with device type kCPUShared. */
inline Context Context::CPUShared(int32_t dev_id) {
  return Context::Create(DeviceType::kCPUShared, dev_id);
}

/*! \brief Shorthand for Create with device type kGPU. */
inline Context Context::GPU(int32_t dev_id) {
  return Context::Create(DeviceType::kGPU, dev_id);
}

/*!
 * \brief Report whether a CUDA driver is visible.
 * \return false when built without CUDA; otherwise whether the reported driver version is > 0.
 */
inline bool Context::GPUDriverPresent() {
#if !MXNET_USE_CUDA
  return false;
#else
  // The version stays 0 when the query succeeds but reports no driver.
  int cuda_driver_version = 0;
  CHECK_EQ(cudaDriverGetVersion(&cuda_driver_version), cudaSuccess);
  const bool driver_found = cuda_driver_version > 0;
  return driver_found;
#endif
}

/*!
 * \brief Count the CUDA devices.
 * \return 0 when built without CUDA, when no driver is present, or when CUDA reports
 *         "no device" / "insufficient driver"; the device count otherwise.
 */
inline int32_t Context::GetGPUCount() {
#if !MXNET_USE_CUDA
  return 0;
#else
  if (!GPUDriverPresent()) {
    return 0;
  }
  int32_t num_devices;
  cudaError_t e = cudaGetDeviceCount(&num_devices);
  switch (e) {
    case cudaErrorNoDevice:
    // TODO(junwu): Remove cudaErrorInsufficientDriver
    // This is skipped for working around wheel build system with older CUDA driver.
    case cudaErrorInsufficientDriver:
      return 0;
    default:
      break;
  }
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
  return num_devices;
#endif
}

/*!
 * \brief Number of streams per GPU worker, derived once from MXNET_GPU_WORKER_NSTREAMS.
 * \return 2 if the environment variable is set to a value >= 2, otherwise 1.
 */
inline int32_t Context::GetGPUStreamsPerWorker() {
  // Evaluated on first call only; later changes to the environment are not observed.
  static int32_t num_streams = [] {
    // The default number of streams available if the user has not set MXNET_GPU_WORKER_NSTREAMS.
    const int32_t default_num_streams = 1;
    const int32_t requested = dmlc::GetEnv("MXNET_GPU_WORKER_NSTREAMS", default_num_streams);
    // The get_aux_stream() interface can supply one additional stream beyond the standard one.
    return requested < 2 ? 1 : 2;
  }();
  return num_streams;
}

/*!
 * \brief Query free and total memory of GPU `dev`.
 *
 * Temporarily switches the calling thread to `dev` and switches back to the previously
 * current device afterwards, including when the memory query itself fails.
 * Raises a fatal error on any CUDA failure, or when built without CUDA.
 * \param dev the GPU number to query
 * \param free_mem receives the free GPU memory
 * \param total_mem receives the total GPU memory
 */
inline void Context::GetGPUMemoryInformation(int dev, uint64_t* free_mem, uint64_t* total_mem) {
#if MXNET_USE_CUDA

  size_t memF = 0, memT = 0;
  cudaError_t e;

  int curDevice;
  e = cudaGetDevice(&curDevice);
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

  e = cudaSetDevice(dev);
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

  // Restore the caller's device before inspecting the query result: the check below throws
  // on failure and must not leave this thread bound to `dev`.
  const cudaError_t query_err = cudaMemGetInfo(&memF, &memT);
  e                           = cudaSetDevice(curDevice);
  CHECK_EQ(query_err, cudaSuccess) << " CUDA: " << cudaGetErrorString(query_err);
  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

  *free_mem  = static_cast<uint64_t>(memF);
  *total_mem = static_cast<uint64_t>(memT);

#else
  LOG(FATAL) << "This call is only supported for MXNet built with CUDA support.";
#endif
}

/*!
 * \brief Parse a context from a string of the form `type(n)`.
 *
 * Accepted types are "cpu", "gpu", "cpu_pinned" and "cpu_shared"; `n` must be an integer that
 * spans everything between the parentheses, so input such as "gpu(1abc)" is rejected
 * instead of being read as gpu(1).
 * Any malformed input ends in a fatal "Invalid context string" error.
 * \param str the string pattern
 * \return the parsed Context
 */
inline Context Context::FromString(const std::string& str) {
  Context ret;
  try {
    const std::string::size_type l = str.find('(');
    CHECK_NE(l, std::string::npos);
    // The first ')' must be the last character, which also guarantees r > l.
    const std::string::size_type r = str.find(')');
    CHECK_EQ(r, str.length() - 1);

    const std::string type   = str.substr(0, l);
    const std::string id_str = str.substr(l + 1, r - l - 1);
    // std::stoi stops at the first non-numeric character; require that it consumed it all.
    size_t parsed = 0;
    const int id  = std::stoi(id_str, &parsed);
    CHECK_EQ(parsed, id_str.length());
    if (type == "cpu") {
      ret = CPU(id);
    } else if (type == "gpu") {
      ret = GPU(id);
    } else if (type == "cpu_pinned") {
      ret = CPUPinned(id);
    } else if (type == "cpu_shared") {
      ret = CPUShared(id);
    } else {
      LOG(FATAL) << "Invalid context string " << str;
    }
  } catch (...) {
    // Collapse every failure (failed checks, std::stoi exceptions) into one uniform message.
    LOG(FATAL) << "Invalid context string " << str;
  }
  return ret;
}

inline std::ostream& operator<<(std::ostream& out, const Context& ctx) {
  if (ctx.dev_type == Context::kCPU) {
    out << "cpu(";
  } else if (ctx.dev_type == Context::kGPU) {
    out << "gpu(";
  } else if (ctx.dev_type == Context::kCPUPinned) {
    out << "cpu_pinned(";
  } else if (ctx.dev_type == Context::kCPUShared) {
    out << "cpu_shared(";
  } else {
    out << "unknown(";
  }
  out << ctx.dev_id << ")";
  return out;
}

// describe op registration point
// STRINGIZE goes through STRINGIZE_DETAIL so that a macro argument such as __LINE__ is
// expanded to its value before being turned into a string literal.
#define STRINGIZE_DETAIL(x) #x
#define STRINGIZE(x)        STRINGIZE_DETAIL(x)
// Expands to describe(<args> "\n\nFrom:<file>:<line>"); the suffix is appended by adjacent
// string-literal concatenation, so the last argument must end in a string literal.
#define MXNET_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" STRINGIZE(__LINE__))
// String literal "\n\nDefined in <file>:L<line>" for the place where the macro is used.
#define ADD_FILELINE        "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__)

// Alignment constant, only declared when oneDNN or intgemm support is compiled in.
// NOTE(review): presumably a byte alignment for buffers handed to those libraries; confirm
// against the users of this constant.
#if MXNET_USE_ONEDNN == 1 || MXNET_USE_INTGEMM == 1
constexpr size_t kDNNLAlign = 64;
#endif

}  // namespace mxnet

namespace std {
/*! \brief Hash support so mxnet::Context can be used as a key in unordered containers. */
template <>
struct hash<mxnet::Context> {
  /*! \brief Combine device type and device id, in that order, starting from a zero seed. */
  size_t operator()(const mxnet::Context& ctx) const {
    const size_t type_hash = dmlc::HashCombine(size_t{0}, static_cast<size_t>(ctx.dev_type));
    return dmlc::HashCombine(type_hash, static_cast<size_t>(ctx.dev_id));
  }
};

#if __cplusplus < 201402L && !defined(_MSC_VER)
/*!
 * \brief Stand-in for C++14 std::make_unique, compiled only for pre-C++14 non-MSVC builds.
 * \tparam T type of the object to create
 * \param args constructor arguments, perfectly forwarded to T
 * \return unique_ptr owning the newly created object
 */
template <typename T, typename... Args>
inline std::unique_ptr<T> make_unique(Args&&... args) {
  std::unique_ptr<T> owner(new T(std::forward<Args>(args)...));
  return owner;
}
#endif
}  // namespace std

#include "./tensor_blob.h"
//! \endcond
#endif  // MXNET_BASE_H_