
Commit

finish Encoder bw (#373)
* encoder_bw

* format
hexisyztem authored Sep 6, 2022
1 parent e234497 commit 33d9838
Showing 19 changed files with 340 additions and 115 deletions.
3 changes: 0 additions & 3 deletions lightseq/csrc/kernels/cublas_wrappers.cu
@@ -14,9 +14,6 @@ int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
(const void *)B, CUDA_R_32F, (transb == CUBLAS_OP_N) ? k : n,
(const void *)beta, C, CUDA_R_32F, m, CUDA_R_32F, algo);

// std::cout << transa << " " << transb << " " << m << " " << n << " " << k <<
// std::endl;

if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
24 changes: 20 additions & 4 deletions lightseq/csrc/kernels/cuda_util.cu
@@ -64,10 +64,11 @@ template void check_gpu_error<cublasStatus_t>(cublasStatus_t result,

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele) {
std::cout << outn << ": ";
std::cout << outn << " address: " << outv << std::endl;
std::vector<T> hout(num_output_ele, (T)0);
CHECK_GPU_ERROR(cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(T),
cudaMemcpyDeviceToHost));
printf("value: ");
for (int i = 0; i < num_output_ele; i++) {
std::cout << hout[i] << ", ";
}
@@ -77,10 +78,11 @@ void print_vec(const T *outv, std::string outn, int num_output_ele) {
template <>
void print_vec<__half>(const __half *outv, std::string outn,
int num_output_ele) {
std::cout << outn << ": ";
std::cout << outn << " address: " << outv << std::endl;
std::vector<__half> hout(num_output_ele, (__half)0.f);
CHECK_GPU_ERROR(cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(__half),
cudaMemcpyDeviceToHost));
printf("value: ");
for (int i = 0; i < num_output_ele; i++) {
std::cout << __half2float(hout[i]) << ", ";
}
@@ -90,10 +92,11 @@ void print_vec<__half>(const __half *outv, std::string outn,
template <>
void print_vec<int8_t>(const int8_t *outv, std::string outn,
int num_output_ele) {
std::cout << outn << ": ";
std::cout << outn << " address: " << outv << std::endl;
std::vector<int8_t> hout(num_output_ele, 0);
cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(int8_t),
cudaMemcpyDeviceToHost);
printf("value: ");
for (int i = 0; i < num_output_ele; i++) {
std::cout << static_cast<int>(hout[i]) << ", ";
}
@@ -103,10 +106,11 @@ void print_vec<int8_t>(const int8_t *outv, std::string outn,
template <>
void print_vec<uint8_t>(const uint8_t *outv, std::string outn,
int num_output_ele) {
std::cout << outn << ": ";
std::cout << outn << " address: " << outv << std::endl;
std::vector<uint8_t> hout(num_output_ele, 0);
cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(uint8_t),
cudaMemcpyDeviceToHost);
printf("value: ");
for (int i = 0; i < num_output_ele; i++) {
std::cout << static_cast<int>(hout[i]) << ", ";
}
@@ -295,3 +299,15 @@ int getSMVersion() {
CHECK_GPU_ERROR(cudaGetDeviceProperties(&props, device));
return props.major * 10 + props.minor;
}

void print_time_duration(
const std::chrono::high_resolution_clock::time_point &start,
std::string duration_name, cudaStream_t stream) {
CHECK_GPU_ERROR(cudaStreamSynchronize(stream));
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = finish - start;
std::cout << duration_name
<< " duration time is: " << (elapsed).count() * 1000 << " ms"
<< std::endl;
return;
}
5 changes: 5 additions & 0 deletions lightseq/csrc/kernels/includes/cuda_util.h
@@ -46,3 +46,8 @@ void check_2norm(const T *data_ptr, std::string tensor_name, int dsize,
cudaStream_t stream);

int getSMVersion();

/* Print run time, for debug */
void print_time_duration(
const std::chrono::high_resolution_clock::time_point &start,
std::string duration_name, cudaStream_t stream = 0);
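
A minimal usage sketch of the new timing helper (illustrative only, not part of this commit; the wrapped copy, the include path, and the "d2d_copy" label are assumptions):

#include <chrono>
#include <cstddef>
#include <cuda_runtime.h>
#include "cuda_util.h"

// Times a device-to-device copy on `stream`; print_time_duration synchronizes
// the stream and then reports the wall-clock time since `start` in ms.
void timed_d2d_copy(float *dst, const float *src, size_t n,
                    cudaStream_t stream) {
  auto start = std::chrono::high_resolution_clock::now();
  CHECK_GPU_ERROR(cudaMemcpyAsync(dst, src, n * sizeof(float),
                                  cudaMemcpyDeviceToDevice, stream));
  print_time_duration(start, "d2d_copy", stream);
}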
18 changes: 16 additions & 2 deletions lightseq/csrc/lsflow/context.cpp
@@ -29,6 +29,11 @@ void Context::set_thread_context(ContextPtr context_ptr) {
void Context::remove_thread_context() { thread_context_ptr.reset(); }

void Context::add_op(Operator* op) {
if (built()) {
printf("Context has constructed! should not add new operator!\n");
exit(-1);
}

if (_layer_context.size()) {
_layer_context[0]->_op_vec.push_back(op);
return;
@@ -44,6 +49,11 @@ void Context::add_op(Operator* op) {
void Context::add_node(Node* node) { _all_node_vec.push_back(node); }

void Context::enter_layer(Layer* cur_layer, bool is_initial) {
if (built()) {
printf("Context has constructed! should not modify network\n");
exit(-1);
}

if (_layer_context.size() == 0 && is_initial == false) {
_root_layers.push_back(cur_layer);
} else if (is_initial == true) {
@@ -58,6 +68,8 @@ void Context::build() {
}
_building = true;

printf("===== start Context build =====\n");

if (!check_validate()) {
printf("Check validate error!\n");
exit(-1);
@@ -82,13 +94,13 @@ void Context::build() {
}

if (_is_training) {
for (Layer* rl : _root_layers) {
for (int idx = _root_layers.size() - 1; idx >= 0; idx--) {
Layer* rl = _root_layers[idx];
rl->backward();
}
}

cuda_free(temporary_buffer_);

_mm_ptr->calculate_buffer_();
_built = true;

@@ -99,6 +111,8 @@ void Context::build() {
#ifdef DEBUG
draw_all_context();
#endif

printf("===== finish Context build =====\n");
}

bool Context::check_validate() {
21 changes: 11 additions & 10 deletions lightseq/csrc/lsflow/manager.cpp
@@ -91,6 +91,14 @@ void MemoryManager::calculate_buffer_() {

printf("total_consumption: %zu\n", total_consumption);

buffer_ = cuda_malloc<char>(total_consumption);
buffer_size_ = total_consumption;

for (auto iter : tensor_usages_vec) {
int unique_id = iter.first.unique_id;
tensor_ptr.emplace(unique_id, buffer_ + iter.second);
}

// Add algorithm check module
// return true means check success,
auto judge_func = [](const std::pair<TensorUsage, size_t> &x,
@@ -121,9 +129,10 @@ void MemoryManager::calculate_buffer_() {
#ifdef DEBUG
printf(
"idx: %d, life cycle : [%d, %d], name: %s, size: %zu, offset: %zu, "
"end_addr: %zu\n",
"end_addr: %zu, address: %p\n",
unique_id, iter.first.first_idx, iter.first.last_idx,
iter.first._name.c_str(), size, iter.second, iter.second + size);
iter.first._name.c_str(), size, iter.second, iter.second + size,
buffer_ + iter.second);
#endif

for (auto check_iter : temp_check_vec) {
@@ -150,14 +159,6 @@ void MemoryManager::calculate_buffer_() {
}
temp_check_vec.push_back(iter);
}

buffer_ = cuda_malloc<char>(total_consumption);
buffer_size_ = total_consumption;

for (auto iter : tensor_usages_vec) {
int unique_id = iter.first.unique_id;
tensor_ptr.emplace(unique_id, buffer_ + iter.second);
}
}

} // namespace lightseq
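
For context, a hedged sketch of the conflict condition the new check module guards against: two tensors may only be assigned overlapping buffer ranges if their life cycles (node-index ranges) never overlap. The standalone struct and function below are illustrative stand-ins, not the actual TensorUsage or MemoryManager members.

#include <cstddef>
#include <utility>

struct UsageSketch {
  int first_idx;  // first node index at which the tensor is alive
  int last_idx;   // last node index at which the tensor is alive
  size_t size;    // bytes reserved for the tensor
};

// Each pair mirrors the (TensorUsage, offset) pairs iterated in
// calculate_buffer_: usage metadata plus a byte offset into the shared buffer.
bool regions_conflict(const std::pair<UsageSketch, size_t> &a,
                      const std::pair<UsageSketch, size_t> &b) {
  bool lifetimes_overlap = !(a.first.last_idx < b.first.first_idx ||
                             b.first.last_idx < a.first.first_idx);
  bool memory_overlaps = !(a.second + a.first.size <= b.second ||
                           b.second + b.first.size <= a.second);
  // A conflict means two simultaneously-live tensors share buffer bytes.
  return lifetimes_overlap && memory_overlaps;
}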
7 changes: 4 additions & 3 deletions lightseq/csrc/lsflow/tensor.cpp
@@ -6,8 +6,8 @@ Tensor::Tensor(std::string name, size_t size) : _id(global_tensor_id++) {
_name = name;
_size = size;
_mtype = size > 0 ? SharedMemory : FixedMemory;
_ctx_ptr = thread_context_ptr.get();
if (_mtype == SharedMemory) {
_ctx_ptr = thread_context_ptr.get();
_mm_ptr = _ctx_ptr->memory_manager_ptr();
_ctx_ptr->mx_tensor_size =
std::max(thread_context_ptr->mx_tensor_size, _size);
@@ -34,12 +34,13 @@ char* Tensor::tensor(bool is_open_interval) {
// printf("%s is null when use, plz set first!\n", _name.c_str());
// exit(-1);
// }
if (!_ctx_ptr->built() && _ptr == nullptr) {
return _ctx_ptr->temporary_buffer_;
}
return _ptr;
}
if (_ptr == nullptr) {
if (!_ctx_ptr->built()) {
// printf("tensor_name: %s, node_idx: %zu\n", _name.c_str(),
// _ctx_ptr->node_idx());
update_life_idx(_ctx_ptr->node_idx() - is_open_interval);
return _ctx_ptr->temporary_buffer_;
}
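
A simplified sketch of the lazy binding that the branch above implements (illustrative only; the real Tensor also distinguishes FixedMemory from SharedMemory and records life-cycle indices through update_life_idx):

#include <cstddef>

struct TensorSketch {
  char *_ptr = nullptr;      // real address, bound after Context::build()
  char *_scratch = nullptr;  // shared temporary buffer used while tracing
  bool _ctx_built = false;

  char *data() {
    // While the graph is still being traced, hand out scratch memory so ops
    // can execute; after the context is built, return the bound address.
    if (_ptr == nullptr && !_ctx_built) {
      return _scratch;
    }
    return _ptr;
  }
};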
26 changes: 26 additions & 0 deletions lightseq/csrc/ops_new/bias_act_dropout.cpp
@@ -31,6 +31,17 @@ void BiasActDropoutOp<T1, T2>::forward() {
} else {
throw std::runtime_error("not supported activation: " + _activation_fn);
}

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s forward\n", name().c_str());
print_vec(input, "input", 10);
print_vec(output, "output", 10);
print_vec((int*)mask_ptr, "mask_ptr", 10);
printf("\n");
}
#endif
}

template <typename T1, typename T2>
@@ -57,6 +68,21 @@ void BiasActDropoutOp<T1, T2>::backward() {
} else {
throw std::runtime_error("not supported activation: " + _activation_fn);
}

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s backward _activation_fn: %s\n", name().c_str(),
_activation_fn.c_str());
print_vec(input, "input", 10);
print_vec(bias, "bias", 10);
print_vec(grad_inp, "grad_inp", 10);
print_vec(grad_bias, "grad_bias", 10);
print_vec(grad_out, "grad_out", 10);
print_vec((int*)mask_ptr, "mask_ptr", 10);
printf("\n");
}
#endif
}

template class BiasActDropoutOp<float, float>;
17 changes: 15 additions & 2 deletions lightseq/csrc/ops_new/bias_dropout_residual.cpp
@@ -31,6 +31,7 @@ void BiasDropoutResOp<T1, T2>::forward() {
#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s forward\n", name().c_str());
print_vec(residual, this->name() + " residual", 10);
print_vec(bias, this->name() + " bias", 10);
print_vec(output, this->name() + " ans", 10);
@@ -57,13 +58,25 @@ void BiasDropoutResOp<T1, T2>::backward() {
_rows, _cols, RATIO(), stream);

if (is_res_cover) { // cover
cudaMemcpy((void*)residual_grad, (void*)output_grad,
_max_ele_num * sizeof(T2), cudaMemcpyDeviceToDevice);
CHECK_GPU_ERROR(cudaMemcpyAsync((void*)residual_grad, (void*)output_grad,
_cols * _rows * sizeof(T2),
cudaMemcpyDefault, stream));
} else { // accumulate
// launch_fused_add2 ...
launch_fused_add2(residual_grad, output_grad, residual_grad, _rows, 1,
_cols, stream);
}

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(stream);
printf("%s backward is_res_cover: %d\n", name().c_str(), is_res_cover);
print_vec(input_grad, this->name() + " input_grad", 10);
print_vec(output_grad, this->name() + " output_grad", 10);
print_vec(residual_grad, this->name() + " residual_grad", 10);
printf("\n");
}
#endif
}

template class BiasDropoutResOp<float, float>;
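
A brief sketch of the copy pattern the backward pass switches to above (illustrative; the helper name is an assumption). cudaMemcpyAsync with cudaMemcpyDefault lets the runtime infer the transfer direction from the pointer addresses and keeps the copy ordered with the other work on the op's stream, instead of blocking the whole device the way the previous synchronous cudaMemcpy did.

#include <cstddef>
#include <cuda_runtime.h>

// Device-to-device gradient copy, ordered with the other work on `stream`.
cudaError_t copy_grad_async(void *dst, const void *src, size_t bytes,
                            cudaStream_t stream) {
  return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDefault, stream);
}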
1 change: 1 addition & 0 deletions lightseq/csrc/ops_new/dropout.cpp
@@ -26,6 +26,7 @@ void DropoutOp<T1, T2>::forward() {
#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s forward\n", name().c_str());
print_vec(input, this->name() + " inp", 10);
print_vec(output, this->name() + " out", 10);
printf("\n");
17 changes: 15 additions & 2 deletions lightseq/csrc/ops_new/feed_forward.cpp
@@ -41,25 +41,38 @@ void FeedForwardOp<T1, T2>::backward() {
float alpha = (float)1.0, w_beta = (float)0.0, inp_beta = (float)0.0;

T2* out_grad = (T2*)child(0)->grad();
T1* weights = (T1*)parent(1)->value();
T1* input_ptr = (T1*)parent(0)->value();
T1* weights = (T1*)parent(1)->value();

T2* weights_grad = (T2*)parent(1)->grad();
T2* inp_grad = (T2*)parent(0)->grad();
T2* weights_grad = (T2*)parent(1)->grad();

if (!parent(0)->is_cover()) {
inp_beta = (float)1.0;
}

cublasHandle_t _cublasHandle = _context_ptr->get_cublashandle();

// calculate weights_grad
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, _input_size,
_output_size, _batch_tokens, &alpha, &w_beta, input_ptr,
out_grad, weights_grad, cublasGemmAlgo_t(_gemm_algos[1]));

// calculate inp_grad
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, _input_size,
_batch_tokens, _output_size, &alpha, &inp_beta, weights,
out_grad, inp_grad, cublasGemmAlgo_t(_gemm_algos[2]));

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s backward\n", name().c_str());
print_vec(inp_grad, "inp_grad", 10);
print_vec(out_grad, "out_grad", 10);
print_vec(input_ptr, "input_ptr", 10);
printf("\n");
}
#endif
}

template class FeedForwardOp<float, float>;
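
For reference, what the two GEMMs in the backward pass compute, written in the row-major math view. This assumes (not verified in this diff) the common convention that the weight is stored as (output_size x input_size) and the forward pass is Y = X W^T with X of shape (batch_tokens x input_size):

\frac{\partial L}{\partial W} \;=\; \Big(\frac{\partial L}{\partial Y}\Big)^{\top} X,
\qquad
\frac{\partial L}{\partial X} \;=\; \frac{\partial L}{\partial Y}\, W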
3 changes: 1 addition & 2 deletions lightseq/csrc/ops_new/includes/bias_act_dropout.h
@@ -27,8 +27,7 @@ class BiasActDropoutOp : public Operator {
ratio(r),
_activation_fn(activation_fn),
_max_ele_num(max_ele_num) {
_mask.reset(
new Tensor("BiasActDropoutOp/_mask", max_ele_num * sizeof(uint8_t)));
_mask.reset(new Tensor(name() + "/_mask", max_ele_num * sizeof(uint8_t)));
}

virtual ~BiasActDropoutOp() {}
2 changes: 1 addition & 1 deletion lightseq/csrc/ops_new/includes/feed_forward.h
@@ -33,7 +33,7 @@ class FeedForwardOp : public Operator {

void backward() override;

void before_backward(int batch_tokens) { _batch_tokens = batch_tokens; }
void before_backward() {}
};

} // namespace lightseq
13 changes: 12 additions & 1 deletion lightseq/csrc/ops_new/layer_normalize.cpp
@@ -39,7 +39,8 @@ void LayerNormalizeOp<T1, T2>::forward() {
#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
print_vec(ln_res_val, name() + " ans", 10);
printf("%s forward\n", name().c_str());
print_vec(ln_res_val, "ln_res_val", 10);
printf("\n");
}
#endif
@@ -76,6 +77,16 @@ void LayerNormalizeOp<T1, T2>::backward() {
launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
out_val, gamma_val, betta_val, vars_val, means_val,
_batch_tokens, _hidden_dim, streams);

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s backward\n", name().c_str());
print_vec(inp_grad, "inp_grad", 10);
print_vec(out_grad, "out_grad", 10);
printf("\n");
}
#endif
}

template class LayerNormalizeOp<__half, __half>;
(Diffs for the remaining changed files were not loaded on this page.)