diff --git a/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.cu
index 240ddf407de24..3aecd3795cea2 100644
--- a/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.cu
@@ -118,6 +118,7 @@ void SpmmPluginDynamic::cusparseLtContext::init(
     4. Init algorithm selection descriptor (alg_sel)
     5. Init plan descriptor (plan)
   */
+  std::cout << "init context" << std::endl;
   PADDLE_ENFORCE_EQ(
       is_initialized, false,
       platform::errors::InvalidArgument(
@@ -204,6 +205,7 @@ void SpmmPluginDynamic::cusparseLtContext::setAlgo(int alg) {
 }
 
 void SpmmPluginDynamic::cusparseLtContext::destroy() {
+  std::cout << "destroy context" << std::endl;
   PADDLE_ENFORCE_EQ(is_initialized, true,
                     platform::errors::InvalidArgument(
                         "cusparseLtContext is destroy before init"));
@@ -217,6 +219,7 @@ void SpmmPluginDynamic::cusparseLtContext::destroy() {
 void SpmmPluginDynamic::cusparseLtContext::compressMatB(
     int n, int k, cudaDataType_t type, void* src, void** dest,
     size_t* compressed_size) {
+  std::cout << "compress matB" << std::endl;
   PADDLE_ENFORCE_EQ(
       is_initialized, false,
       platform::errors::InvalidArgument(
@@ -268,6 +271,7 @@ SpmmPluginDynamic::SpmmPluginDynamic(const std::string& layer_name,
     5. Copy the compressed weight to host
     6. Convert bias precision and copy (on host)
   */
+  std::cout << "new plugin" << std::endl;
   precision_size_ = getElementSize(precision);
   element_size_ =
       (precision_ == nvinfer1::DataType::kINT8 ? 4 : precision_size_);
@@ -318,8 +322,14 @@ SpmmPluginDynamic::SpmmPluginDynamic(const std::string& layer_name,
                              &compressed_size_);
   weight_compressed_ = new char[compressed_size_];
   weight_compressed_dev_global_.reset(weight_compressed_dev_, cudaFreeFunc);
+  std::cout << "initial count: " << weight_compressed_dev_global_.use_count() << std::endl;
   cudaMemcpy(weight_compressed_, weight_compressed_dev_global_.get(), compressed_size_,
              cudaMemcpyDeviceToHost);
+  std::cout << "compressed weight:";
+  for (int i = 0; i < 10; i++) {
+    std::cout << " " << static_cast<int>(reinterpret_cast<char*>(weight_compressed_)[i]);
+  }
+  std::cout << std::endl;
 
   has_bias_ = (bias.count != 0);
   if (has_bias_) {
@@ -368,6 +378,7 @@ SpmmPluginDynamic::SpmmPluginDynamic(const std::string& layer_name,
     4. (Configured) Copy the bias to device
     5. (Configured) Init cuSPARSELt descriptors
   */
+  std::cout << "clone plugin" << std::endl;
   precision_size_ = getElementSize(precision);
   element_size_ =
       (precision_ == nvinfer1::DataType::kINT8 ? 4 : precision_size_);
@@ -404,6 +415,7 @@ SpmmPluginDynamic::SpmmPluginDynamic(const std::string name, const void* data,
       weight_compressed_dev_global_(nullptr),
       bias_(nullptr),
       bias_dev_(nullptr) {
+  std::cout << "deserialization" << std::endl;
   DeserializeValue(&data, &length, &precision_);
   DeserializeValue(&data, &length, &precision_size_);
   DeserializeValue(&data, &length, &element_size_);
@@ -423,11 +435,18 @@ SpmmPluginDynamic::SpmmPluginDynamic(const std::string name, const void* data,
   weight_compressed_ = new char[compressed_size_];
   deserialize_value_size(&data, &length, weight_compressed_, compressed_size_);
   //MEM: how to deal with deserialization?
-  cudaMalloc(reinterpret_cast<void**>(weight_compressed_dev_global_.get()),
+  auto* p_tmp = weight_compressed_dev_global_.get();
+  cudaMalloc(reinterpret_cast<void**>(&p_tmp),
              compressed_size_);
   cudaMemcpy(weight_compressed_dev_global_.get(), weight_compressed_, compressed_size_,
              cudaMemcpyHostToDevice);
+  std::cout << "compressed weight:";
+  for (int i = 0; i < 10; i++) {
+    std::cout << " " << static_cast<int>(reinterpret_cast<char*>(weight_compressed_)[i]);
+  }
+  std::cout << std::endl;
+
   if (has_bias_) {
     bias_ = new float[out_dim_];
     deserialize_value_size(&data, &length, bias_, sizeof(float) * out_dim_);
@@ -540,6 +559,7 @@ void SpmmPluginDynamic::configurePlugin(
     2. Copy the bias to device
     3. Search the optimal algorithm
   */
+  std::cout << "configure plugin" << std::endl;
   try {
     PADDLE_ENFORCE_EQ(nbInputs, 1,
                       platform::errors::InvalidArgument(
@@ -638,6 +658,7 @@ int SpmmPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
                                const void* const* inputs, void* const* outputs,
                                void* workSpace, cudaStream_t stream) noexcept {
   try {
+    std::cout << "enqueue" << std::endl;
     PADDLE_ENFORCE_EQ(is_configured_, true,
                       platform::errors::InvalidArgument(
                           "The plugin is not configured before enqueue"));
@@ -655,16 +676,34 @@ int SpmmPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
   if (inputDesc->type == nvinfer1::DataType::kFLOAT) {
     const auto* const input = static_cast<const float*>(inputs[0]);
     auto* output = static_cast<float*>(outputs[0]);
+    auto* weight_compressed_dev_p_ = weight_compressed_dev_global_.get();
+    char* test_weight = new char[compressed_size_];
+    cudaMemcpy(test_weight, weight_compressed_dev_global_.get(), compressed_size_,
+               cudaMemcpyDeviceToHost);
+    std::cout << "compressed weight:";
+    for (int i = 0; i < 10; i++) {
+      std::cout << " " << static_cast<int>(reinterpret_cast<char*>(weight_compressed_)[i]);
+    }
+    std::cout << std::endl;
+
+    std::cout << "weight from shared ptr:";
+    for (int i = 0; i < 10; i++) {
+      std::cout << " " << static_cast<int>(test_weight[i]);
+    }
+    std::cout << std::endl;
+
     cusparseStatus_t status = paddle::platform::dynload::cusparseLtMatmul(
         &spmm_context_.handle, &spmm_context_.plan, &alpha, input,
-        weight_compressed_dev_global_.get(), &beta, output, output, workSpace, &stream, 1);
+        weight_compressed_dev_p_, &beta, output, output, workSpace, &stream, 1);
     return status != CUSPARSE_STATUS_SUCCESS;
   } else if (inputDesc->type == nvinfer1::DataType::kHALF) {
     const auto* const input = static_cast<const half*>(inputs[0]);
     auto* output = static_cast<half*>(outputs[0]);
+    auto* weight_compressed_dev_p_ = weight_compressed_dev_global_.get();
     cusparseStatus_t status = paddle::platform::dynload::cusparseLtMatmul(
         &spmm_context_.handle, &spmm_context_.plan, &alpha, input,
-        weight_compressed_dev_global_.get(), &beta, output, output, workSpace, &stream, 1);
+        weight_compressed_dev_p_, &beta, output, output, workSpace, &stream, 1);
     return status != CUSPARSE_STATUS_SUCCESS;
   } else if (inputDesc->type == nvinfer1::DataType::kINT8) {
     alpha = inputDesc->scale * weight_scale_ / outputDesc->scale;
@@ -736,7 +775,7 @@ void SpmmPluginDynamic::serialize(void* buffer) const noexcept {
   SerializeValue(&buffer, compressed_size_);
   SerializeValue(&buffer, has_bias_);
   SerializeValue(&buffer, activation_);
-
+  std::cout << "serialize" << std::endl;
   char* d = static_cast<char*>(buffer);
   std::copy_n(static_cast<char*>(weight_compressed_), compressed_size_,
               d);
@@ -747,10 +786,13 @@
 }
 
 void SpmmPluginDynamic::destroy() noexcept {
+  std::cout << "destroy plugin" << std::endl;
   delete[] reinterpret_cast<char*>(weight_compressed_);
   //MEM:
   // cudaFree(weight_compressed_dev_);
-  weight_compressed_dev_global_.reset();
+  // std::cout << "current use count before this destroy: " << weight_compressed_dev_global_.use_count() << std::endl;
+  // weight_compressed_dev_global_.reset();
+  std::cout << "current use count after this destroy: " << weight_compressed_dev_global_.use_count() << std::endl;
   if (has_bias_) {
     cudaFree(bias_dev_);
   }
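
Reviewer note (not part of the patch): the debug prints above are chasing ownership of the compressed weight, which lives on the device behind a std::shared_ptr<void> (weight_compressed_dev_global_) whose deleter is cudaFree. Two things stand out in the hunks: in the deserialization constructor, cudaMalloc writes into a local copy of the stored pointer (p_tmp), so the shared_ptr itself still holds its old null pointer when the following cudaMemcpy runs; and in enqueue, test_weight is never freed. The sketch below is a minimal, standalone illustration of the ownership pattern and of that pitfall; every name in it (kSize, raw, weight_dev, clone, empty, p_tmp) is hypothetical, and only the shared_ptr-with-cudaFree scheme mirrors the plugin's.

// Minimal sketch, assuming only the CUDA runtime API; compile with nvcc.
#include <cstdio>
#include <memory>

#include <cuda_runtime.h>

namespace {
constexpr size_t kSize = 256;  // stand-in for compressed_size_

// Deleter in the style of the patch's cudaFreeFunc.
void cudaFreeFunc(void* p) {
  if (p != nullptr) {
    cudaFree(p);
  }
}
}  // namespace

int main() {
  // Allocate on device, then hand ownership to the shared_ptr, as the
  // weight-compressing constructor does with weight_compressed_dev_.
  void* raw = nullptr;
  cudaMalloc(&raw, kSize);
  std::shared_ptr<void> weight_dev(raw, cudaFreeFunc);
  std::printf("use_count after init: %ld\n", weight_dev.use_count());   // 1

  // Cloning a plugin only copies the shared_ptr; both instances see the
  // same device buffer, and cudaFree runs once the last copy is gone.
  std::shared_ptr<void> clone = weight_dev;
  std::printf("use_count after clone: %ld\n", weight_dev.use_count());  // 2

  // The deserialization pitfall: get() returns the stored pointer by value,
  // so cudaMalloc into a copy (p_tmp) never updates the shared_ptr, which
  // still holds nullptr when it is later used as a memcpy destination.
  std::shared_ptr<void> empty;       // models weight_compressed_dev_global_(nullptr)
  void* p_tmp = empty.get();         // p_tmp == nullptr
  cudaMalloc(&p_tmp, kSize);         // empty.get() is STILL nullptr here
  empty.reset(p_tmp, cudaFreeFunc);  // one fix: transfer ownership explicitly

  clone.reset();  // back to use_count == 1; the buffer is not freed yet
  std::printf("use_count after clone.reset(): %ld\n", weight_dev.use_count());
  return 0;  // remaining shared_ptrs run cudaFree as they go out of scope
}

Under this scheme, destroy() on one plugin instance only drops a reference, which is what the use_count prints added in the last hunk observe; the device buffer itself is released when the final holder is destroyed.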