add MKLDNN_DEVICE #3712

Merged: 13 commits, Aug 30, 2017
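Summary of the change: upgrade MKL-DNN and MKLML from v0.9 to v0.10, add a PADDLE_DEVICE_ID enum (MKLDNN_DEVICE = -2, CPU_DEVICE = -1) plus deviceId-aware input accessors to Layer, and rework MKLDNNFcLayer to build its forward and backward primitives from MKLDNNMatrix objects instead of raw memory descriptors and data pointers.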
2 changes: 1 addition & 1 deletion cmake/external/mkldnn.cmake
@@ -51,7 +51,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
-GIT_TAG "v0.9"
+GIT_TAG "v0.10"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
2 changes: 1 addition & 1 deletion cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)

SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
2 changes: 1 addition & 1 deletion paddle/gserver/layers/Layer.cpp
@@ -41,7 +41,7 @@ namespace paddle {
Layer::Layer(const LayerConfig& config, bool useGpu)
: config_(config),
useGpu_(useGpu),
-deviceId_(-1),
+deviceId_(CPU_DEVICE),
needSequenceInfo_(true) {}

bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
29 changes: 28 additions & 1 deletion paddle/gserver/layers/Layer.h
@@ -59,7 +59,12 @@ class Layer {
LayerConfig config_;
/// whether to use GPU
bool useGpu_;
-/// Device Id. CPU is -1, and GPU is 0, 1, 2 ...
+/// Paddle device ID, MKLDNN is -2, CPU is -1
+enum PADDLE_DEVICE_ID {
+  MKLDNN_DEVICE = -2,
+  CPU_DEVICE = -1,
+};
+/// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
int deviceId_;
/// Input layers
std::vector<LayerPtr> inputLayers_;
@@ -77,6 +82,7 @@
Argument output_;
/// Several outputs stored on different devices, used in 'parallel_nn' case,
/// and record them by deviceId_.
+/// Also used in 'use_mkldnn' case.
std::vector<Argument> outputOtherDevice_;
/// If there are several outputs, map them by each name.
std::map<std::string, Argument*> outputMap_;
@@ -172,6 +178,13 @@
return inputLayer.getOutput(deviceId_);
}

+/**
+ * Get the argument of input layer with deviceId.
+ */
+const Argument& getInput(size_t inputIndex, int deviceId) const {
+  return inputLayers_[inputIndex]->getOutput(deviceId);
+}

/**
* Get the forward-input value.
*/
@@ -186,6 +199,13 @@
return inputLayer.getOutput(deviceId_).value;
}

+/**
+ * Get the forward-input value with deviceId.
+ */
+const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+  return inputLayers_[inputIndex]->getOutput(deviceId).value;
+}

/**
* Get the forward-input grad.
*/
@@ -200,6 +220,13 @@
return inputLayer.getOutput(deviceId_).grad;
}

+/**
+ * Get the forward-input grad with deviceId.
+ */
+const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+  return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+}

/**
* Get the forward-input label.
*/
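To make the new deviceId-aware accessors concrete, here is a minimal usage sketch (a hypothetical call site, not part of the diff; CPU_DEVICE is the enum value added above):

    // Fetch the CPU-device copy of input 0, e.g. in an MKLDNN layer
    // whose previous layer ran on the CPU device.
    const Argument& in = getInput(0, CPU_DEVICE);           // full argument
    const MatrixPtr& inVal = getInputValue(0, CPU_DEVICE);  // its value
    const MatrixPtr& inGrad = getInputGrad(0, CPU_DEVICE);  // its grad, may be null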
215 changes: 117 additions & 98 deletions paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
return;
}

-// TODO(TJ): dst format should get from wgtVal_
-int dstFmt = PARAM_FORMAT_MKLDNN_OI;
-int srcFmt = weight_->getParameterPtr()->getHeaderFormat();
-if (srcFmt == dstFmt) {
-  return;
-}
-
-// The weight_ is transposed from initial paddle weight
-MatrixPtr paddleWgt = Matrix::create(
-    weight_->getW()->getData(), iLayerSize_, oc_, false, false);
-
-// TODO(TJ): remove this print when do not need differ weights
-std::ostringstream ostr;
-paddleWgt->print(ostr);
-VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
-
-// The mkldnn weight is transposed from initial paddle matrix
-MatrixPtr paddleWgtT;
-paddleWgt->transpose(paddleWgtT, true);
-weight_->getW()->copyFrom(*paddleWgtT);
-weight_->getParameterPtr()->setHeaderFormat(dstFmt);
+CHECK(wgtVal_) << "should have been initialized";
+bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
+auto targetDim = wgtVal_->getDims();
+auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true;
}

void MKLDNNFcLayer::convertWeightsToPaddle() {
-MatrixPtr dnnWgt = weight_->getW();
-MatrixPtr paddleWgt;
-dnnWgt->transpose(paddleWgt, true);
-
-// copy paddle weight and override on weight_
-MatrixPtr dnnWgtT = Matrix::create(
-    dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
-dnnWgtT->copyFrom(*paddleWgt);
+CHECK(wgtVal_) << "should have been initialized";
+bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
+auto targetDim = wgtVal_->getDims();
+auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
}
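Note: the io/ihwo formats above encode the transpose that the deleted code performed by hand. Paddle stores a fully connected weight as (input, output), i.e. io, or ihwo once the spatial dims are folded in, while the MKLDNN primitive expects oi/oihw, so a single in-place reorder now replaces the explicit transpose-and-copy.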

+void MKLDNNFcLayer::convertOutputToOtherDevice() {
+  copyOutputInfoToOtherDevice();
+  // find other cpu device and reorder output to cpu device
+  int cnt = 0;
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+      // fc cpu output value do not need convert
+      // just share point
+      outputOtherDevice_[i].value = output_.value;
+      ++cnt;
[Review thread]

Reviewer (Contributor): For MKLDNN layers, how many places actually need the value converted? If for most layers it is just

    outputOtherDevice_[i].value = output_.value;

then put that in a base-class function and let only the layers that really need a conversion override it; that would be cleaner. The check that there is at most one CPU device should also go into the base class. And why can there not be more than one?

Author (Contributor): It is not the case that most layers only need outputOtherDevice_[i].value = output_.value; the other layers need different handling. FC can share the pointer directly because its output is always in nc format, which matches Paddle's CPU-device format. Once more layers are added, this can be tidied up in another pass. As for the single-CPU-device check: in theory more than one should never appear, but I am not sure the current design covers every case (for example, whether the RNN case is affected), so it is only a warning. Even if there were several, each would still share the pointer. That behavior is also specific to the FC layer.

Reviewer (Contributor): If the check is needed at all, it should live in MKLDNNLayer's convertOutputToOtherDevice; that can be changed in the next PR. (A sketch of such a base-class default follows this function.)

+    }
+  }
+
+  if (cnt > 1) {
+    LOG(WARNING) << "should not have more than one CPU device";
+  }
+}
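A minimal sketch of the base-class default the reviewer asks for above (hypothetical; this PR keeps the logic in the FC layer):

    // Hypothetical MKLDNNLayer default: share the value with every
    // CPU-device output; layers that need a real conversion override it.
    void MKLDNNLayer::convertOutputToOtherDevice() {
      copyOutputInfoToOtherDevice();
      int cnt = 0;
      for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
        if (outputOtherDevice_[i].deviceId != CPU_DEVICE) continue;
        outputOtherDevice_[i].value = output_.value;  // share, no copy
        ++cnt;
      }
      if (cnt > 1) {
        LOG(WARNING) << "should not have more than one CPU device";
      }
    }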

void MKLDNNFcLayer::reshape() {
-const Argument& input = getInput(0);
+const Argument& input = getInput(0, getPrev(0)->getDeviceId());
int batchSize = input.getBatchSize();
if (bs_ == batchSize) {
return;
@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
if (iw_ == 0) {
iw_ = 1;
}
-hasSpatial_ = true;
-if (ih_ == 1 && iw_ == 1) {
-  hasSpatial_ = false;
-}
CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
ic_ = iLayerSize_ / (ih_ * iw_);
CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
@@ -135,37 +130,53 @@

void MKLDNNFcLayer::resetFwd() {
bool hasBias = biases_ && biases_->getW();
-real* iData = getInputValue(0)->getData();
-real* oData = getOutputValue()->getData();
-real* wData = weight_->getW()->getData();
-real* bData = hasBias ? biases_->getW()->getData() : NULL;
-
-// TODO(TJ): below create should be covered in MkldnnMatrix
-// create memory desc
-memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
-                               : createMD({bs_, ic_}, format::nc);
-memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                               : createMD({oc_, ic_}, format::oi);
-memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
-                                 : createMD({}, format::format_undef);
-memory::desc oMD = createMD({bs_, oc_}, format::nc);
-
-// create memory primitive desc and memory self
-inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
-wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
-outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
+const MatrixPtr& wgt = weight_->getW();
+const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
+const MatrixPtr& out = output_.value;

+if (prevIsOnlyMKLDNN()) {
[Review thread]

Reviewer (Contributor): How about renaming these to inputIsOnlyMKLDNN and outputIsOnlyMKLDNN? A layer has no notion of "next", and since prev pairs with next, both names should be phrased in input/output terms.

Author (Contributor): Sure, done.

+  const MatrixPtr& in = getInputValue(0);
+  inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
+  CHECK(inVal_) << "Input should be MKLDNNMatrix";
+} else {
+  CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+  const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
+  inVal_ = MKLDNNMatrix::create(
+      in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
+}
+inVal_->downSpatial();
+wgtVal_ = MKLDNNMatrix::create(
+    wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
+wgtVal_->downSpatial();
+biasVal_ =
+    hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
+outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);

+// change original output value to mkldnn output value
[Review thread]

Reviewer (Contributor): "original output value": what format does that refer to?

Author (Contributor): "Original output" here means replacing the layer's existing output value with a pointer that can be cast to MKLDNNMatrixPtr, so that if the next layer is an MKLDNN layer it can cast directly and obtain the information it needs.

+output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+if (!nextIsOnlyMKLDNN()) {
+  convertOutputToOtherDevice();
+}

// create forward handle
prop_kind pk = prop_kind::forward;
-fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
-                                     : fc_fwd::desc(pk, iMD, wMD, oMD);
+fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
+                                              inVal_->getMemoryDesc(),
+                                              wgtVal_->getMemoryDesc(),
+                                              biasVal_->getMemoryDesc(),
+                                              outVal_->getMemoryDesc())
+                               : fc_fwd::desc(pk,
+                                              inVal_->getMemoryDesc(),
+                                              wgtVal_->getMemoryDesc(),
+                                              outVal_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);

-if (bData != NULL) {
-  biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
+if (hasBias) {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
} else {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
}
+printValueFormatFlow();

pipelineFwd_.clear();
pipelineFwd_.push_back(*fwd_);
}
@@ -175,45 +186,49 @@ void MKLDNNFcLayer::resetBwd() {
return;
}
needResetBwd_ = false;

bool hasBias = biases_ && biases_->getWGrad();
-real* iData = getInputValue(0)->getData();
-real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
-real* oDiff = getOutputGrad()->getData();
-real* wDiff = weight_->getWGrad()->getData();
-real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;

-/// backward weight
-// create memory desc for backward memory
-memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
-                               : createMD({bs_, ic_}, format::nc);
-memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                               : createMD({oc_, ic_}, format::oi);
-memory::desc oMD = createMD({bs_, oc_}, format::nc);
-memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
-                                 : createMD({}, format::format_undef);

-if (inVal_) {
-  // update data
-  inVal_->set_data_handle(iData);
+CHECK(inVal_) << "Should have input value";
+const MatrixPtr& wgt = weight_->getWGrad();
+const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;

+// TODO(TJ): merge outgrad
+if (nextIsOnlyMKLDNN()) {
+  // can not directly cast outputgrad to mkldnnmatrix,
+  // since each layer can not write the inputgrad to mkldnn inputgrad.
+  // So just create from matrix with outputvalue format.
+  const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
+  outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
} else {
-  inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
+  const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
+  // fc does not need to convert from cpu device since the output is
+  // always in nc format; only need to create from cpu device
+  outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
[Review thread]

Reviewer (Contributor): The two branches at lines 197-207 can also be merged into the form used at line 241:

    int device = nextIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
    const MatrixPtr& out = getOutput(device).grad;
    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());

Also, what does "nc" at the end of the comment on line 205 mean? Is it the nc format?

Author (Contributor): OK, thanks. Yes, it is the format; I will make the comment more explicit.

}
[Review thread]

Reviewer (Contributor): Merging the output grads is already done in Layer::waitAndMergeOutputGrad, so why reimplement it here? Also, "top diffs" is Caffe terminology; please reword the comment (same below).

Author (Contributor): The base class's waitAndMergeOutputGrad is specific to the ParallelNeuralNetwork case: it waits on and works with the worker threads, so it cannot be reused directly here. Besides, the merge here should eventually be implemented with mkldnn::sum, so the base-class function would not apply anyway. The Caffe wording: done.
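For context, the base-class merge the author refers to reduces to roughly the following (a simplified sketch of Layer::waitAndMergeOutputGrad; the real function also synchronizes with the parallel_nn worker threads before accumulating):

    // Simplified sketch: accumulate every other-device gradient into
    // the layer's main output gradient.
    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
      output_.grad->add(*outputOtherDevice_[i].grad);
    }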


-// create memory primitive desc and memory self
-wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
-outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
+wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
+biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
+                    : nullptr;

-fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
+// create memory primitive desc
+fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
+                                    inVal_->getMemoryDesc(),
+                                    wgtGrad_->getMemoryDesc(),
+                                    outGrad_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
-                                 ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
-                                 : fc_bwdWgt::desc(iMD, wMD, oMD);
+fc_bwdWgt::desc bwdWgtDesc = hasBias
+                                 ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                   wgtGrad_->getMemoryDesc(),
+                                                   biasGrad_->getMemoryDesc(),
+                                                   outGrad_->getMemoryDesc())
+                                 : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                   wgtGrad_->getMemoryDesc(),
+                                                   outGrad_->getMemoryDesc());
fc_bwdWgt::primitive_desc bwdWgtPD =
fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);

-if (bDiff != NULL) {
-  biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
+if (hasBias) {
bwdWgt_.reset(
new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
} else {
@@ -223,15 +238,26 @@ void MKLDNNFcLayer::resetBwd() {
pipelineBwd_.push_back(*bwdWgt_);

/// backward data
-if (iDiff == NULL) {
+int device = prevIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+const MatrixPtr& in = getInputGrad(0, device);
+if (in == nullptr) {
return;
}
-fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
+if (getInput(0, device).getAllCount() > 1) {
+  // TODO(TJ): use outputMaps_ ways when merge outgrad done
+} else {
+  inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
+}
[Review thread]

Reviewer (Contributor): Lines 230-249 can be collapsed into two branches:

    bool device = prevIsMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
    const MatrixPtr& in = getInputGrad(0, device);
    if (in == nullptr) return;
    if (getInput(0, device).getAllCount() > 1) {
      // TODO(TJ): use outputMaps_ ways when merge topdiff done
    } else {
      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
    }

Author (Contributor): Done, except that device cannot be a bool; it has to be an int.
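(The bool version would be a genuine bug, not just a style issue: MKLDNN_DEVICE is -2 and CPU_DEVICE is -1, both nonzero, so a bool device always converts to true, and the accessor would effectively be called with device id 1.)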


+fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
+                                                wgtGrad_->getMemoryDesc(),
+                                                outGrad_->getMemoryDesc());
fc_bwdData::primitive_desc bwdDataPD =
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));

CHECK(wgtVal_) << "Should have weight memory";
bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+printGradFormatFlow();
pipelineBwd_.push_back(*bwdData_);
}

@@ -241,11 +267,7 @@ void MKLDNNFcLayer::forward(PassType passType) {

{
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());

-// update input data
-// since it might be changed if this is after data layer
-real* iData = getInputValue(0)->getData();
-inVal_->set_data_handle(iData);
+syncInputValue();

// just submit forward pipeline
stream_->submit(pipelineFwd_);
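syncInputValue() itself is not shown in this diff. Judging from the lines it replaces, a plausible shape is the following (an assumption; the actual helper presumably lives in the MKLDNN base-layer code and may differ):

    // Hypothetical sketch of syncInputValue(): re-point the MKLDNN input
    // memory at the current input buffer, which can change when this
    // layer follows a data layer. Assumes inVal_ still exposes
    // mkldnn::memory::set_data_handle.
    void syncInputValue() {
      real* iData = getInputValue(0)->getData();
      inVal_->set_data_handle(iData);
    }

syncOutputGrad() in backward() below presumably mirrors this for outGrad_ and the output gradient.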
@@ -267,10 +289,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
resetBwd();

-// update diff
-real* oDiff = getOutputGrad()->getData();
-outGrad_->set_data_handle(oDiff);
+syncOutputGrad();
// just submit backward pipeline
stream_->submit(pipelineBwd_);
}