-
Notifications
You must be signed in to change notification settings - Fork 5.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add MKLDNN_DEVICE #3712
add MKLDNN_DEVICE #3712
Changes from 12 commits
4d8992c
462b9b1
62e6dac
4bffbd3
4eecd0c
48d87e5
780c8d9
4cc5783
98b7c67
2efac83
fe51f72
bfbd066
c5183ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { | |
return; | ||
} | ||
|
||
// TODO(TJ): dst format should get from wgtVal_ | ||
int dstFmt = PARAM_FORMAT_MKLDNN_OI; | ||
int srcFmt = weight_->getParameterPtr()->getHeaderFormat(); | ||
if (srcFmt == dstFmt) { | ||
return; | ||
} | ||
|
||
// The weight_ is transposed from initial paddle weight | ||
MatrixPtr paddleWgt = Matrix::create( | ||
weight_->getW()->getData(), iLayerSize_, oc_, false, false); | ||
|
||
// TODO(TJ): remove this print when do not need differ weights | ||
std::ostringstream ostr; | ||
paddleWgt->print(ostr); | ||
VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str(); | ||
|
||
// The mkldnn weight is transposed from initial paddle matrix | ||
MatrixPtr paddleWgtT; | ||
paddleWgt->transpose(paddleWgtT, true); | ||
weight_->getW()->copyFrom(*paddleWgtT); | ||
weight_->getParameterPtr()->setHeaderFormat(dstFmt); | ||
CHECK(wgtVal_) << "should have been initialized"; | ||
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; | ||
auto targetDim = wgtVal_->getDims(); | ||
auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; | ||
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); | ||
hasInitedWgt_ = true; | ||
} | ||
|
||
void MKLDNNFcLayer::convertWeightsToPaddle() { | ||
MatrixPtr dnnWgt = weight_->getW(); | ||
MatrixPtr paddleWgt; | ||
dnnWgt->transpose(paddleWgt, true); | ||
|
||
// copy paddle weight and override on weight_ | ||
MatrixPtr dnnWgtT = Matrix::create( | ||
dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false); | ||
dnnWgtT->copyFrom(*paddleWgt); | ||
CHECK(wgtVal_) << "should have been initialized"; | ||
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; | ||
auto targetDim = wgtVal_->getDims(); | ||
auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; | ||
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); | ||
} | ||
|
||
void MKLDNNFcLayer::convertOutputToOtherDevice() { | ||
copyOutputInfoToOtherDevice(); | ||
// find other cpu device and reorder output to cpu device | ||
int cnt = 0; | ||
for (size_t i = 0; i < outputOtherDevice_.size(); i++) { | ||
if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { | ||
// fc cpu output value do not need convert | ||
// just share point | ||
outputOtherDevice_[i].value = output_.value; | ||
++cnt; | ||
} | ||
} | ||
|
||
if (cnt > 1) { | ||
LOG(WARNING) << "should not have more than one CPU device"; | ||
} | ||
} | ||
|
||
void MKLDNNFcLayer::reshape() { | ||
const Argument& input = getInput(0); | ||
const Argument& input = getInput(0, getPrev(0)->getDeviceId()); | ||
int batchSize = input.getBatchSize(); | ||
if (bs_ == batchSize) { | ||
return; | ||
|
@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() { | |
if (iw_ == 0) { | ||
iw_ = 1; | ||
} | ||
hasSpatial_ = true; | ||
if (ih_ == 1 && iw_ == 1) { | ||
hasSpatial_ = false; | ||
} | ||
CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); | ||
ic_ = iLayerSize_ / (ih_ * iw_); | ||
CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible"; | ||
|
@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() { | |
|
||
void MKLDNNFcLayer::resetFwd() { | ||
bool hasBias = biases_ && biases_->getW(); | ||
real* iData = getInputValue(0)->getData(); | ||
real* oData = getOutputValue()->getData(); | ||
real* wData = weight_->getW()->getData(); | ||
real* bData = hasBias ? biases_->getW()->getData() : NULL; | ||
|
||
// TODO(TJ): below create should be covered in MkldnnMatrix | ||
// create memory desc | ||
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) | ||
: createMD({bs_, ic_}, format::nc); | ||
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) | ||
: createMD({oc_, ic_}, format::oi); | ||
memory::desc bMD = bData != NULL ? createMD({oc_}, format::x) | ||
: createMD({}, format::format_undef); | ||
memory::desc oMD = createMD({bs_, oc_}, format::nc); | ||
|
||
// create memory primitive desc and memory self | ||
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); | ||
wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData)); | ||
outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData)); | ||
const MatrixPtr& wgt = weight_->getW(); | ||
const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr; | ||
const MatrixPtr& out = output_.value; | ||
|
||
if (prevIsOnlyMKLDNN()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 名字改成inputIsOnlyMKLDNN和outputIsOnlyMKLDNN,如何?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 可以~ done. |
||
const MatrixPtr& in = getInputValue(0); | ||
inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in); | ||
CHECK(inVal_) << "Input should be MKLDNNMatrix"; | ||
} else { | ||
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; | ||
const MatrixPtr& in = getInputValue(0, CPU_DEVICE); | ||
inVal_ = MKLDNNMatrix::create( | ||
in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_); | ||
} | ||
inVal_->downSpatial(); | ||
wgtVal_ = MKLDNNMatrix::create( | ||
wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_); | ||
wgtVal_->downSpatial(); | ||
biasVal_ = | ||
hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr; | ||
outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_); | ||
|
||
// change original output value to mkldnn output value | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. original output value?指的是什么格式? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这里的 |
||
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_); | ||
if (!nextIsOnlyMKLDNN()) { | ||
convertOutputToOtherDevice(); | ||
} | ||
|
||
// create forward handle | ||
prop_kind pk = prop_kind::forward; | ||
fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD) | ||
: fc_fwd::desc(pk, iMD, wMD, oMD); | ||
fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk, | ||
inVal_->getMemoryDesc(), | ||
wgtVal_->getMemoryDesc(), | ||
biasVal_->getMemoryDesc(), | ||
outVal_->getMemoryDesc()) | ||
: fc_fwd::desc(pk, | ||
inVal_->getMemoryDesc(), | ||
wgtVal_->getMemoryDesc(), | ||
outVal_->getMemoryDesc()); | ||
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); | ||
|
||
if (bData != NULL) { | ||
biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData)); | ||
if (hasBias) { | ||
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_)); | ||
} else { | ||
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_)); | ||
} | ||
printValueFormatFlow(); | ||
|
||
pipelineFwd_.clear(); | ||
pipelineFwd_.push_back(*fwd_); | ||
} | ||
|
@@ -175,45 +186,49 @@ void MKLDNNFcLayer::resetBwd() { | |
return; | ||
} | ||
needResetBwd_ = false; | ||
|
||
bool hasBias = biases_ && biases_->getWGrad(); | ||
real* iData = getInputValue(0)->getData(); | ||
real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL; | ||
real* oDiff = getOutputGrad()->getData(); | ||
real* wDiff = weight_->getWGrad()->getData(); | ||
real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL; | ||
|
||
/// backward weight | ||
// create memory desc for backward memory | ||
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) | ||
: createMD({bs_, ic_}, format::nc); | ||
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) | ||
: createMD({oc_, ic_}, format::oi); | ||
memory::desc oMD = createMD({bs_, oc_}, format::nc); | ||
memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x) | ||
: createMD({}, format::format_undef); | ||
|
||
if (inVal_) { | ||
// update data | ||
inVal_->set_data_handle(iData); | ||
CHECK(inVal_) << "Should have input value"; | ||
const MatrixPtr& wgt = weight_->getWGrad(); | ||
const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr; | ||
|
||
// TODO(TJ): merge outgrad | ||
if (nextIsOnlyMKLDNN()) { | ||
// can not directly cast outputgrad to mkldnnmatrix, | ||
// since each layer can not write the inputgrad to mkldnn inputgrad. | ||
// So just create from matrix with outputvalue format. | ||
const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad; | ||
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc()); | ||
} else { | ||
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); | ||
const MatrixPtr& out = getOutput(CPU_DEVICE).grad; | ||
// fc do not need to convert from cpu device since output always nc | ||
// only need create from cpu device | ||
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 197-207两分支也可以合并成241行的形式
另外,205行注释末尾nc是什么意思?是nc format么? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK thanks。 是的,就是格式, 我写详细点。 |
||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
父类的是
done |
||
|
||
// create memory primitive desc and memory self | ||
wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff)); | ||
outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff)); | ||
wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc()); | ||
biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc()) | ||
: nullptr; | ||
|
||
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD); | ||
// create memory primitive desc | ||
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, | ||
inVal_->getMemoryDesc(), | ||
wgtGrad_->getMemoryDesc(), | ||
outGrad_->getMemoryDesc()); | ||
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); | ||
fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL | ||
? fc_bwdWgt::desc(iMD, wMD, bMD, oMD) | ||
: fc_bwdWgt::desc(iMD, wMD, oMD); | ||
fc_bwdWgt::desc bwdWgtDesc = hasBias | ||
? fc_bwdWgt::desc(inVal_->getMemoryDesc(), | ||
wgtGrad_->getMemoryDesc(), | ||
biasGrad_->getMemoryDesc(), | ||
outGrad_->getMemoryDesc()) | ||
: fc_bwdWgt::desc(inVal_->getMemoryDesc(), | ||
wgtGrad_->getMemoryDesc(), | ||
outGrad_->getMemoryDesc()); | ||
fc_bwdWgt::primitive_desc bwdWgtPD = | ||
fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD); | ||
|
||
if (bDiff != NULL) { | ||
biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff)); | ||
if (hasBias) { | ||
bwdWgt_.reset( | ||
new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_)); | ||
} else { | ||
|
@@ -223,15 +238,26 @@ void MKLDNNFcLayer::resetBwd() { | |
pipelineBwd_.push_back(*bwdWgt_); | ||
|
||
/// backward data | ||
if (iDiff == NULL) { | ||
int device = prevIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE; | ||
const MatrixPtr& in = getInputGrad(0, device); | ||
if (in == nullptr) { | ||
return; | ||
} | ||
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD); | ||
if (getInput(0, device).getAllCount() > 1) { | ||
// TODO(TJ): use outputMaps_ ways when merge outgrad done | ||
} else { | ||
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc()); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 230-249可以合并两个分支:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, 但是 device不能是 |
||
|
||
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(), | ||
wgtGrad_->getMemoryDesc(), | ||
outGrad_->getMemoryDesc()); | ||
fc_bwdData::primitive_desc bwdDataPD = | ||
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD); | ||
inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff)); | ||
|
||
CHECK(wgtVal_) << "Should have weight memory"; | ||
bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_)); | ||
printGradFormatFlow(); | ||
pipelineBwd_.push_back(*bwdData_); | ||
} | ||
|
||
|
@@ -241,11 +267,7 @@ void MKLDNNFcLayer::forward(PassType passType) { | |
|
||
{ | ||
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); | ||
|
||
// update input data | ||
// since it might be changed if this is after data layer | ||
real* iData = getInputValue(0)->getData(); | ||
inVal_->set_data_handle(iData); | ||
syncInputValue(); | ||
|
||
// just submit forward pipeline | ||
stream_->submit(pipelineFwd_); | ||
|
@@ -267,10 +289,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) { | |
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); | ||
resetBwd(); | ||
|
||
// update diff | ||
real* oDiff = getOutputGrad()->getData(); | ||
outGrad_->set_data_handle(oDiff); | ||
|
||
syncOutputGrad(); | ||
// just submit backward pipeline | ||
stream_->submit(pipelineBwd_); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
对mkldnn layer来说,要转value的地方多么?如果大部分layer都是
那放进基类函数即可。需要转的layer再单独写一下,会比较清爽。
不能超过1个CPU设备的检查也应该放进基类函数中吧。而且为什么不能超过1个呢?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
不是大部分都是
outputOtherDevice_[i].value = output_.value;
,另外的layer是需要别的操作,这里fc因为一直是nc格式的输出,与paddle的cpu device格式一样,所以直接可以share。不过后面layer多一点之后,可以再整理一遍的。
不超过一个CPU device是理论上我认为不应该会出现多个,担心目前考虑的不周全,比如RNN的case会不会有影响,所以给一个warning。如果就算是多个,每个还是用的share。这一点也是特定在FClayer的。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
如果要做检查,也应该放在mkldnnLayer的convertOutputToOtherDevice里做,可以在下一个PR中修改。