
Commit

finish Encoder bw (#373)
* encoder_bw

* format
hexisyztem authored Sep 6, 2022
1 parent e234497 commit 33d9838
Showing 19 changed files with 340 additions and 115 deletions.
3 changes: 0 additions & 3 deletions lightseq/csrc/kernels/cublas_wrappers.cu
@@ -14,9 +14,6 @@ int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
(const void *)B, CUDA_R_32F, (transb == CUBLAS_OP_N) ? k : n,
(const void *)beta, C, CUDA_R_32F, m, CUDA_R_32F, algo);

// std::cout << transa << " " << transb << " " << m << " " << n << " " << k <<
// std::endl;

if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
24 changes: 20 additions & 4 deletions lightseq/csrc/kernels/cuda_util.cu
@@ -64,10 +64,11 @@ template void check_gpu_error<cublasStatus_t>(cublasStatus_t result,

template <typename T>
void print_vec(const T *outv, std::string outn, int num_output_ele) {
std::cout << outn << ": ";
std::cout << outn << " address: " << outv << std::endl;
std::vector<T> hout(num_output_ele, (T)0);
CHECK_GPU_ERROR(cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(T),
cudaMemcpyDeviceToHost));
printf("value: ");
for (int i = 0; i < num_output_ele; i++) {
std::cout << hout[i] << ", ";
}
@@ -77,10 +78,11 @@ void print_vec(const T *outv, std::string outn, int num_output_ele) {
template <>
void print_vec<__half>(const __half *outv, std::string outn,
int num_output_ele) {
std::cout << outn << ": ";
std::cout << outn << " address: " << outv << std::endl;
std::vector<__half> hout(num_output_ele, (__half)0.f);
CHECK_GPU_ERROR(cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(__half),
cudaMemcpyDeviceToHost));
printf("value: ");
for (int i = 0; i < num_output_ele; i++) {
std::cout << __half2float(hout[i]) << ", ";
}
@@ -90,10 +92,11 @@ void print_vec<__half>(const __half *outv, std::string outn,
template <>
void print_vec<int8_t>(const int8_t *outv, std::string outn,
int num_output_ele) {
std::cout << outn << ": ";
std::cout << outn << " address: " << outv << std::endl;
std::vector<int8_t> hout(num_output_ele, 0);
cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(int8_t),
cudaMemcpyDeviceToHost);
printf("value: ");
for (int i = 0; i < num_output_ele; i++) {
std::cout << static_cast<int>(hout[i]) << ", ";
}
@@ -103,10 +106,11 @@ void print_vec<int8_t>(const int8_t *outv, std::string outn,
template <>
void print_vec<uint8_t>(const uint8_t *outv, std::string outn,
int num_output_ele) {
std::cout << outn << ": ";
std::cout << outn << " address: " << outv << std::endl;
std::vector<uint8_t> hout(num_output_ele, 0);
cudaMemcpy(hout.data(), outv, num_output_ele * sizeof(uint8_t),
cudaMemcpyDeviceToHost);
printf("value: ");
for (int i = 0; i < num_output_ele; i++) {
std::cout << static_cast<int>(hout[i]) << ", ";
}
@@ -295,3 +299,15 @@ int getSMVersion() {
CHECK_GPU_ERROR(cudaGetDeviceProperties(&props, device));
return props.major * 10 + props.minor;
}

void print_time_duration(
const std::chrono::high_resolution_clock::time_point &start,
std::string duration_name, cudaStream_t stream) {
CHECK_GPU_ERROR(cudaStreamSynchronize(stream));
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = finish - start;
std::cout << duration_name
<< " duration time is: " << (elapsed).count() * 1000 << " ms"
<< std::endl;
return;
}
5 changes: 5 additions & 0 deletions lightseq/csrc/kernels/includes/cuda_util.h
@@ -46,3 +46,8 @@ void check_2norm(const T *data_ptr, std::string tensor_name, int dsize,
cudaStream_t stream);

int getSMVersion();

/* Print run time, for debug */
void print_time_duration(
const std::chrono::high_resolution_clock::time_point &start,
std::string duration_name, cudaStream_t stream = 0);
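
A minimal usage sketch of the new timing helper (illustrative only, not part of this commit; the wrapped copy, the include path, and the "d2d_copy" label are assumptions):

#include <chrono>
#include <cstddef>
#include <cuda_runtime.h>
#include "cuda_util.h"

// Times a device-to-device copy on `stream`; print_time_duration synchronizes
// the stream and then reports the wall-clock time since `start` in ms.
void timed_d2d_copy(float *dst, const float *src, size_t n,
                    cudaStream_t stream) {
  auto start = std::chrono::high_resolution_clock::now();
  CHECK_GPU_ERROR(cudaMemcpyAsync(dst, src, n * sizeof(float),
                                  cudaMemcpyDeviceToDevice, stream));
  print_time_duration(start, "d2d_copy", stream);
}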
18 changes: 16 additions & 2 deletions lightseq/csrc/lsflow/context.cpp
@@ -29,6 +29,11 @@ void Context::set_thread_context(ContextPtr context_ptr) {
void Context::remove_thread_context() { thread_context_ptr.reset(); }

void Context::add_op(Operator* op) {
if (built()) {
printf("Context has constructed! should not add new operator!\n");
exit(-1);
}

if (_layer_context.size()) {
_layer_context[0]->_op_vec.push_back(op);
return;
@@ -44,6 +49,11 @@ void Context::add_op(Operator* op) {
void Context::add_node(Node* node) { _all_node_vec.push_back(node); }

void Context::enter_layer(Layer* cur_layer, bool is_initial) {
if (built()) {
printf("Context has constructed! should not modify network\n");
exit(-1);
}

if (_layer_context.size() == 0 && is_initial == false) {
_root_layers.push_back(cur_layer);
} else if (is_initial == true) {
@@ -58,6 +68,8 @@ void Context::build() {
}
_building = true;

printf("===== start Context build =====\n");

if (!check_validate()) {
printf("Check validate error!\n");
exit(-1);
@@ -82,13 +94,13 @@ void Context::build() {
}

if (_is_training) {
for (Layer* rl : _root_layers) {
for (int idx = _root_layers.size() - 1; idx >= 0; idx--) {
Layer* rl = _root_layers[idx];
rl->backward();
}
}

cuda_free(temporary_buffer_);

_mm_ptr->calculate_buffer_();
_built = true;

@@ -99,6 +111,8 @@ void Context::build() {
#ifdef DEBUG
draw_all_context();
#endif

printf("===== finish Context build =====\n");
}

bool Context::check_validate() {
21 changes: 11 additions & 10 deletions lightseq/csrc/lsflow/manager.cpp
@@ -91,6 +91,14 @@ void MemoryManager::calculate_buffer_() {

printf("total_consumption: %zu\n", total_consumption);

buffer_ = cuda_malloc<char>(total_consumption);
buffer_size_ = total_consumption;

for (auto iter : tensor_usages_vec) {
int unique_id = iter.first.unique_id;
tensor_ptr.emplace(unique_id, buffer_ + iter.second);
}

// Add algorithm check module
// return true means check success,
auto judge_func = [](const std::pair<TensorUsage, size_t> &x,
@@ -121,9 +129,10 @@ void MemoryManager::calculate_buffer_() {
#ifdef DEBUG
printf(
"idx: %d, life cycle : [%d, %d], name: %s, size: %zu, offset: %zu, "
"end_addr: %zu\n",
"end_addr: %zu, address: %p\n",
unique_id, iter.first.first_idx, iter.first.last_idx,
iter.first._name.c_str(), size, iter.second, iter.second + size);
iter.first._name.c_str(), size, iter.second, iter.second + size,
buffer_ + iter.second);
#endif

for (auto check_iter : temp_check_vec) {
@@ -150,14 +159,6 @@ void MemoryManager::calculate_buffer_() {
}
temp_check_vec.push_back(iter);
}

buffer_ = cuda_malloc<char>(total_consumption);
buffer_size_ = total_consumption;

for (auto iter : tensor_usages_vec) {
int unique_id = iter.first.unique_id;
tensor_ptr.emplace(unique_id, buffer_ + iter.second);
}
}

} // namespace lightseq
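
For context, a hedged sketch of the conflict condition the new check module guards against: two tensors may only be assigned overlapping buffer ranges if their life cycles (node-index ranges) never overlap. The standalone struct and function below are illustrative stand-ins, not the actual TensorUsage or MemoryManager members.

#include <cstddef>
#include <utility>

struct UsageSketch {
  int first_idx;  // first node index at which the tensor is alive
  int last_idx;   // last node index at which the tensor is alive
  size_t size;    // bytes reserved for the tensor
};

// Each pair mirrors the (TensorUsage, offset) pairs iterated in
// calculate_buffer_: usage metadata plus a byte offset into the shared buffer.
bool regions_conflict(const std::pair<UsageSketch, size_t> &a,
                      const std::pair<UsageSketch, size_t> &b) {
  bool lifetimes_overlap = !(a.first.last_idx < b.first.first_idx ||
                             b.first.last_idx < a.first.first_idx);
  bool memory_overlaps = !(a.second + a.first.size <= b.second ||
                           b.second + b.first.size <= a.second);
  // A conflict means two simultaneously-live tensors share buffer bytes.
  return lifetimes_overlap && memory_overlaps;
}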
7 changes: 4 additions & 3 deletions lightseq/csrc/lsflow/tensor.cpp
@@ -6,8 +6,8 @@ Tensor::Tensor(std::string name, size_t size) : _id(global_tensor_id++) {
_name = name;
_size = size;
_mtype = size > 0 ? SharedMemory : FixedMemory;
_ctx_ptr = thread_context_ptr.get();
if (_mtype == SharedMemory) {
_ctx_ptr = thread_context_ptr.get();
_mm_ptr = _ctx_ptr->memory_manager_ptr();
_ctx_ptr->mx_tensor_size =
std::max(thread_context_ptr->mx_tensor_size, _size);
@@ -34,12 +34,13 @@ char* Tensor::tensor(bool is_open_interval) {
// printf("%s is null when use, plz set first!\n", _name.c_str());
// exit(-1);
// }
if (!_ctx_ptr->built() && _ptr == nullptr) {
return _ctx_ptr->temporary_buffer_;
}
return _ptr;
}
if (_ptr == nullptr) {
if (!_ctx_ptr->built()) {
// printf("tensor_name: %s, node_idx: %zu\n", _name.c_str(),
// _ctx_ptr->node_idx());
update_life_idx(_ctx_ptr->node_idx() - is_open_interval);
return _ctx_ptr->temporary_buffer_;
}
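
A simplified sketch of the lazy binding that the branch above implements (illustrative only; the real Tensor also distinguishes FixedMemory from SharedMemory and records life-cycle indices through update_life_idx):

#include <cstddef>

struct TensorSketch {
  char *_ptr = nullptr;      // real address, bound after Context::build()
  char *_scratch = nullptr;  // shared temporary buffer used while tracing
  bool _ctx_built = false;

  char *data() {
    // While the graph is still being traced, hand out scratch memory so ops
    // can execute; after the context is built, return the bound address.
    if (_ptr == nullptr && !_ctx_built) {
      return _scratch;
    }
    return _ptr;
  }
};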
26 changes: 26 additions & 0 deletions lightseq/csrc/ops_new/bias_act_dropout.cpp
@@ -31,6 +31,17 @@ void BiasActDropoutOp<T1, T2>::forward() {
} else {
throw std::runtime_error("not supported activation: " + _activation_fn);
}

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s forward\n", name().c_str());
print_vec(input, "input", 10);
print_vec(output, "output", 10);
print_vec((int*)mask_ptr, "mask_ptr", 10);
printf("\n");
}
#endif
}

template <typename T1, typename T2>
@@ -57,6 +68,21 @@ void BiasActDropoutOp<T1, T2>::backward() {
} else {
throw std::runtime_error("not supported activation: " + _activation_fn);
}

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s backward _activation_fn: %s\n", name().c_str(),
_activation_fn.c_str());
print_vec(input, "input", 10);
print_vec(bias, "bias", 10);
print_vec(grad_inp, "grad_inp", 10);
print_vec(grad_bias, "grad_bias", 10);
print_vec(grad_out, "grad_out", 10);
print_vec((int*)mask_ptr, "mask_ptr", 10);
printf("\n");
}
#endif
}

template class BiasActDropoutOp<float, float>;
17 changes: 15 additions & 2 deletions lightseq/csrc/ops_new/bias_dropout_residual.cpp
@@ -31,6 +31,7 @@ void BiasDropoutResOp<T1, T2>::forward() {
#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s forward\n", name().c_str());
print_vec(residual, this->name() + " residual", 10);
print_vec(bias, this->name() + " bias", 10);
print_vec(output, this->name() + " ans", 10);
@@ -57,13 +58,25 @@ void BiasDropoutResOp<T1, T2>::backward() {
_rows, _cols, RATIO(), stream);

if (is_res_cover) { // cover
cudaMemcpy((void*)residual_grad, (void*)output_grad,
_max_ele_num * sizeof(T2), cudaMemcpyDeviceToDevice);
CHECK_GPU_ERROR(cudaMemcpyAsync((void*)residual_grad, (void*)output_grad,
_cols * _rows * sizeof(T2),
cudaMemcpyDefault, stream));
} else { // accumulate
// launch_fused_add2 ...
launch_fused_add2(residual_grad, output_grad, residual_grad, _rows, 1,
_cols, stream);
}

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(stream);
printf("%s backward is_res_cover: %d\n", name().c_str(), is_res_cover);
print_vec(input_grad, this->name() + " input_grad", 10);
print_vec(output_grad, this->name() + " output_grad", 10);
print_vec(residual_grad, this->name() + " residual_grad", 10);
printf("\n");
}
#endif
}

template class BiasDropoutResOp<float, float>;
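
A brief sketch of the copy pattern the backward pass switches to above (illustrative; the helper name is an assumption). cudaMemcpyAsync with cudaMemcpyDefault lets the runtime infer the transfer direction from the pointer addresses and keeps the copy ordered with the other work on the op's stream, instead of blocking the whole device the way the previous synchronous cudaMemcpy did.

#include <cstddef>
#include <cuda_runtime.h>

// Device-to-device gradient copy, ordered with the other work on `stream`.
cudaError_t copy_grad_async(void *dst, const void *src, size_t bytes,
                            cudaStream_t stream) {
  return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDefault, stream);
}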
1 change: 1 addition & 0 deletions lightseq/csrc/ops_new/dropout.cpp
@@ -26,6 +26,7 @@ void DropoutOp<T1, T2>::forward() {
#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s forward\n", name().c_str());
print_vec(input, this->name() + " inp", 10);
print_vec(output, this->name() + " out", 10);
printf("\n");
17 changes: 15 additions & 2 deletions lightseq/csrc/ops_new/feed_forward.cpp
@@ -41,25 +41,38 @@ void FeedForwardOp<T1, T2>::backward() {
float alpha = (float)1.0, w_beta = (float)0.0, inp_beta = (float)0.0;

T2* out_grad = (T2*)child(0)->grad();
T1* weights = (T1*)parent(1)->value();
T1* input_ptr = (T1*)parent(0)->value();
T1* weights = (T1*)parent(1)->value();

T2* weights_grad = (T2*)parent(1)->grad();
T2* inp_grad = (T2*)parent(0)->grad();
T2* weights_grad = (T2*)parent(1)->grad();

if (!parent(0)->is_cover()) {
inp_beta = (float)1.0;
}

cublasHandle_t _cublasHandle = _context_ptr->get_cublashandle();

// calculate weights_grad
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, _input_size,
_output_size, _batch_tokens, &alpha, &w_beta, input_ptr,
out_grad, weights_grad, cublasGemmAlgo_t(_gemm_algos[1]));

// calculate inp_grad
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, _input_size,
_batch_tokens, _output_size, &alpha, &inp_beta, weights,
out_grad, inp_grad, cublasGemmAlgo_t(_gemm_algos[2]));

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s backward\n", name().c_str());
print_vec(inp_grad, "inp_grad", 10);
print_vec(out_grad, "out_grad", 10);
print_vec(input_ptr, "input_ptr", 10);
printf("\n");
}
#endif
}

template class FeedForwardOp<float, float>;
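
For reference, what the two GEMMs in the backward pass compute, written in the row-major math view. This assumes (not verified in this diff) the common convention that the weight is stored as (output_size x input_size) and the forward pass is Y = X W^T with X of shape (batch_tokens x input_size):

\frac{\partial L}{\partial W} \;=\; \Big(\frac{\partial L}{\partial Y}\Big)^{\top} X,
\qquad
\frac{\partial L}{\partial X} \;=\; \frac{\partial L}{\partial Y}\, W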
3 changes: 1 addition & 2 deletions lightseq/csrc/ops_new/includes/bias_act_dropout.h
@@ -27,8 +27,7 @@ class BiasActDropoutOp : public Operator {
ratio(r),
_activation_fn(activation_fn),
_max_ele_num(max_ele_num) {
_mask.reset(
new Tensor("BiasActDropoutOp/_mask", max_ele_num * sizeof(uint8_t)));
_mask.reset(new Tensor(name() + "/_mask", max_ele_num * sizeof(uint8_t)));
}

virtual ~BiasActDropoutOp() {}
2 changes: 1 addition & 1 deletion lightseq/csrc/ops_new/includes/feed_forward.h
@@ -33,7 +33,7 @@ class FeedForwardOp : public Operator {

void backward() override;

void before_backward(int batch_tokens) { _batch_tokens = batch_tokens; }
void before_backward() {}
};

} // namespace lightseq
13 changes: 12 additions & 1 deletion lightseq/csrc/ops_new/layer_normalize.cpp
@@ -39,7 +39,8 @@ void LayerNormalizeOp<T1, T2>::forward() {
#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
print_vec(ln_res_val, name() + " ans", 10);
printf("%s forward\n", name().c_str());
print_vec(ln_res_val, "ln_res_val", 10);
printf("\n");
}
#endif
@@ -76,6 +77,16 @@ void LayerNormalizeOp<T1, T2>::backward() {
launch_ln_bw(gamma_grad, betta_grad, inp_grad, out_grad, residual_grad,
out_val, gamma_val, betta_val, vars_val, means_val,
_batch_tokens, _hidden_dim, streams);

#ifdef DEBUG
if (_context_ptr->built()) {
cudaStreamSynchronize(_context_ptr->get_stream());
printf("%s backward\n", name().c_str());
print_vec(inp_grad, "inp_grad", 10);
print_vec(out_grad, "out_grad", 10);
printf("\n");
}
#endif
}

template class LayerNormalizeOp<__half, __half>;
(Diffs for the remaining changed files were not loaded on this page.)