Added caching of scales for bias in conv2d int8 #36980
Merged
@@ -389,6 +389,49 @@ class ConvMKLDNNHandlerT
     }
   }

+  std::shared_ptr<std::tuple<float, std::vector<float>>> get_int8_bias_scales(
+      const framework::ExecutionContext& ctx) {
+    // Get scales int8 bias key
+    const std::string key_bs = this->key_ + "@bs";
+
+    // Scales for int8 bias are to be cached to avoid
+    // computing them each iteration
+    auto bias_scale_tuple =
+        std::static_pointer_cast<std::tuple<float, std::vector<float>>>(
+            this->dev_ctx_.GetBlob(key_bs));
+    if (bias_scale_tuple) return bias_scale_tuple;
+
+    const auto* filter = ctx.Input<Tensor>("Filter");
+    const auto& weights_tz = framework::vectorize(filter->dims());
+    const int groups = std::max(ctx.Attr<int>("groups"), 1);
+
+    const auto& scale_weights_data =
+        ctx.Attr<std::vector<float>>("Scale_weights");
+    const auto& scale_in_data = ctx.Attr<float>("Scale_in");
+
+    bool is_multi_channel = scale_weights_data.size() > 1;
+    int mask_reorder = is_multi_channel ? 1 << 0 : 1;
+
+    int count = 1;
+    if (is_multi_channel) {
+      count *= weights_tz[0];
+      if (groups > 1) {
+        count *= weights_tz[1];
+      }
+    }
+
+    bias_scale_tuple =
+        std::make_shared<std::tuple<float, std::vector<float>>>(std::make_tuple(
+            static_cast<float>(mask_reorder), std::vector<float>(count)));
+    for (int i = 0; i < count; i++) {
+      std::get<1>(*bias_scale_tuple)[i] = scale_in_data * scale_weights_data[i];
+    }
+
+    this->dev_ctx_.SetBlob(key_bs, bias_scale_tuple);
+
+    return bias_scale_tuple;
+  }
+
   std::tuple<float, std::vector<float>> get_int8_scales(
       const framework::ExecutionContext& ctx) const {
     const auto* filter = ctx.Input<Tensor>("Filter");

Inline review comment on the "for" loop above: Some time ago I would have complained about "i++", but I have since read that the compiler will optimize it anyway (this knowledge comes from the book about modern CPUs that you recommended, so thank you!).
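The pattern the new method uses is worth spelling out: look up a type-erased blob under a string key, and on a miss compute the tuple once and store it for later iterations. Below is a minimal, self-contained sketch of that pattern; BlobMap, GetOrComputeBiasScales, and the main driver are hypothetical stand-ins for Paddle's dev_ctx_ blob storage, not the real API.

#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

// Hypothetical stand-in for the device context's type-erased blob storage.
using BlobMap = std::map<std::string, std::shared_ptr<void>>;
using BiasScales = std::tuple<float, std::vector<float>>;

std::shared_ptr<BiasScales> GetOrComputeBiasScales(
    BlobMap& blobs, const std::string& key, float scale_in,
    const std::vector<float>& scale_weights) {
  // Fast path: a previous iteration already stored the tuple under this key.
  auto it = blobs.find(key);
  if (it != blobs.end())
    return std::static_pointer_cast<BiasScales>(it->second);

  // Slow path: compute the per-channel bias scales once, then cache them.
  std::vector<float> scales(scale_weights.size());
  for (size_t i = 0; i < scales.size(); ++i)
    scales[i] = scale_in * scale_weights[i];
  auto result = std::make_shared<BiasScales>(1.0f, std::move(scales));
  blobs[key] = result;
  return result;
}

int main() {
  BlobMap blobs;
  auto first = GetOrComputeBiasScales(blobs, "conv2d@bs", 0.5f, {2.0f, 4.0f});
  auto second = GetOrComputeBiasScales(blobs, "conv2d@bs", 0.5f, {2.0f, 4.0f});
  // Identical pointers: the second call was served from the cache.
  return first == second ? 0 : 1;
}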
@@ -428,32 +471,6 @@ class ConvMKLDNNHandlerT
     return std::make_tuple(sum_scale, output_shift_scale);
   }

-  std::tuple<float, std::vector<float>> get_int8_bias_scales(
-      const framework::ExecutionContext& ctx) const {
-    const auto* filter = ctx.Input<Tensor>("Filter");
-    const auto& weights_tz = framework::vectorize(filter->dims());
-    const int groups = std::max(ctx.Attr<int>("groups"), 1);
-
-    const auto& scale_weights_data =
-        ctx.Attr<std::vector<float>>("Scale_weights");
-    const auto& scale_in_data = ctx.Attr<float>("Scale_in");
-
-    bool is_multi_channel = scale_weights_data.size() > 1;
-    int mask_reorder = is_multi_channel ? 1 << 0 : 1;
-    int count =
-        is_multi_channel
-            ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0])
-            : 1;
-    std::vector<float> scale_bias_data(count);
-
-#pragma omp parallel for if (count > 50)
-    for (int i = 0; i < count; i++) {
-      scale_bias_data[i] = scale_in_data * scale_weights_data[i];
-    }
-
-    return std::make_tuple(mask_reorder, scale_bias_data);
-  }

   mkldnn::primitive_attr CreatePostOps(
       std::string fuse_activation, float fuse_alpha, float fuse_beta,
       bool fuse_residual_conn, const std::vector<float> output_shift_scale = {},
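The removed method above computed the scale count with nested ternaries; the added method expresses the same thing as an unrolled multiplication. A quick standalone check of that equivalence; the dims vector is a made-up example, and the only assumption is that weights_tz[0] and weights_tz[1] are the two leading filter dimensions, as in the diff:

#include <cassert>
#include <cstdint>
#include <vector>

// Old formulation: nested ternaries, as in the removed method.
int64_t count_old(const std::vector<int64_t>& weights_tz,
                  bool is_multi_channel, int groups) {
  return is_multi_channel
             ? (groups > 1 ? weights_tz[1] * weights_tz[0] : weights_tz[0])
             : 1;
}

// New formulation: start at 1 and multiply, as in the added method.
int64_t count_new(const std::vector<int64_t>& weights_tz,
                  bool is_multi_channel, int groups) {
  int64_t count = 1;
  if (is_multi_channel) {
    count *= weights_tz[0];
    if (groups > 1) count *= weights_tz[1];
  }
  return count;
}

int main() {
  std::vector<int64_t> dims = {2, 8, 3, 3, 3};  // made-up grouped filter dims
  for (bool mc : {false, true})
    for (int g : {1, 2})
      assert(count_old(dims, mc, g) == count_new(dims, mc, g));
  return 0;
}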
@@ -818,13 +835,11 @@ class ConvMKLDNNOpKernel : public framework::OpKernel<T> {
       {MKLDNN_ARG_DST, *dst_memory_p}};

     if (bias) {
-      float mask_reorder;
-      std::vector<float> scale_bias_data;
-      std::tie(mask_reorder, scale_bias_data) =
-          handler.get_int8_bias_scales(ctx);
+      auto p_scales_tuple = handler.get_int8_bias_scales(ctx);

       auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(
-          bias, is_test, scale_bias_data, mask_reorder);
+          bias, is_test, std::get<1>(*p_scales_tuple),
+          std::get<0>(*p_scales_tuple));
       args.insert({MKLDNN_ARG_BIAS, *bias_memory_p});
     }
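For readers skimming the kernel-side change: the old code unpacked the returned tuple by value with std::tie, while the new code keeps the cached shared_ptr and reads fields through std::get, so the scale vector is not copied on every iteration. A small illustration of the two access styles on a stand-in tuple (names here are illustrative only):

#include <iostream>
#include <memory>
#include <tuple>
#include <vector>

int main() {
  using BiasScales = std::tuple<float, std::vector<float>>;

  // Old style: std::tie copies the tuple's members into locals,
  // including a full copy of the scale vector.
  BiasScales by_value{1.0f, {0.5f, 0.25f}};
  float mask_reorder;
  std::vector<float> scale_bias_data;
  std::tie(mask_reorder, scale_bias_data) = by_value;

  // New style: hold the cached shared_ptr and read through std::get;
  // no per-iteration copy of the vector.
  auto p = std::make_shared<BiasScales>(1.0f, std::vector<float>{0.5f, 0.25f});
  std::cout << std::get<0>(*p) << " " << std::get<1>(*p).size() << "\n";
  return 0;
}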
Review comments:
What is the reason for bit-shifting "1" by zero bits? Isn't "1 << 0" equal to 1? As it stands, the ternary operation is not needed: no matter what value "is_multi_channel" has, the result will be 1.
@wozna Could you please take this question?
From what I understand, "mask_reorder" is used as a set of flags, but even with that knowledge it is not easy to understand at first glance.
I discussed this with @wozna and there is probably some problem here that requires separate investigation (testing accuracy, etc.). It will not be solved in this PR.
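To make the point raised in this thread concrete: 1 << 0 shifts by zero bits, so the ternary yields 1 on both branches. A minimal sketch follows; the commented-out alternative assumes the common mask convention (0 for a single common scale, bit i set when scales vary along dimension i), which is exactly the semantics the discussion above leaves unresolved:

#include <cassert>

int main() {
  // As written in the PR: both branches of the ternary evaluate to 1,
  // which is the reviewer's observation.
  for (bool is_multi_channel : {false, true}) {
    int mask_reorder = is_multi_channel ? 1 << 0 : 1;
    assert(mask_reorder == 1);
  }

  // Hypothetical alternative, assuming "0 == one common scale" and
  // "bit i == scales vary along dimension i" -- not confirmed here:
  // int mask_reorder = is_multi_channel ? 1 << 0 : 0;
  return 0;
}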