Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ncnnqat的环境问题 #1

FitzShen666 opened this issue May 28, 2022 · 6 comments

ncnnqat的环境问题 #1

FitzShen666 opened this issue May 28, 2022 · 6 comments


Copy link

在使用pip install ncnnqat的时候会产生如下报错:
File "setup.py", line 3, in <module>
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
ModuleNotFoundError: No module named 'torch'



make[1]: Entering directory '/root/ncnnqat'
NVCC src/fake_quantize.cu
nvcc -std=c++14 -ccbin=g++ -Xcompiler -fPIC -use_fast_math -DNDEBUG -O3 -I./ -I/usr/local/cuda/include -I/opt/conda/include/python3.7m -I/opt/conda/lib/python3.7/site-packages/torch/include -I/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/opt/conda/lib/python3.7/site-packages/torch/include/TH -I/opt/conda/lib/python3.7/site-packages/torch/include/THC -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=0 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -M src/ -o obj/cuda/fake_quantize.d
-odir obj/cuda
nvcc -std=c++14 -ccbin=g++ -Xcompiler -fPIC -use_fast_math -DNDEBUG -O3 -I./ -I/usr/local/cuda/include -I/opt/conda/include/python3.7m -I/opt/conda/lib/python3.7/site-packages/torch/include -I/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/opt/conda/lib/python3.7/site-packages/torch/include/TH -I/opt/conda/lib/python3.7/site-packages/torch/include/THC -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=0 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -c src/ -o obj/cuda/fake_quantize.o
/opt/conda/lib/python3.7/site-packages/torch/include/ATen/record_function.h(18): warning: attribute "visibility" does not apply here

/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/autograd/profiler.h(97): warning: attribute "visibility" does not apply here

/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/autograd/profiler.h(126): warning: attribute "visibility" does not apply here

src/fake_quantize.cu: error: a value of type "const float *" cannot be assigned to an entity of type "float *"

src/fake_quantize.cu: error: identifier "Row" is undefined

src/fake_quantize.cu: warning: variable "momenta" was declared but never referenced

/opt/conda/lib/python3.7/site-packages/torch/include/c10/util/TypeCast.h(27): warning: calling a constexpr host function("real") from a host device function("apply") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
detected during:
instantiation of "decltype(auto) c10::maybe_real<true, src_t>::apply(src_t) [with src_t=c10::complex]"
(57): here
instantiation of "uint8_t c10::static_cast_with_inter_type<uint8_t, src_t>::apply(src_t) [with src_t=c10::complex]"
(166): here
instantiation of "To c10::convert<To,From>(From) [with To=uint8_t, From=c10::complex]"
(178): here
instantiation of "To c10::checked_convert<To,From>(From, const char *) [with To=uint8_t, From=c10::complex]"
/opt/conda/lib/python3.7/site-packages/torch/include/c10/core/Scalar.h(66): here

2 errors detected in the compilation of "/tmp/tmpxft_00000066_00000000-11_fake_quantize.compute_75.cpp1.ii".
Makefile:70: recipe for target 'obj/cuda/fake_quantize.o' failed
make[1]: *** [obj/cuda/fake_quantize.o] Error 1
make[1]: Leaving directory '/root/ncnnqat'
running install
running bdist_egg
running egg_info
writing ncnnqat.egg-info/PKG-INFO
writing dependency_links to ncnnqat.egg-info/dependency_links.txt
writing requirements to ncnnqat.egg-info/requires.txt
writing top-level names to ncnnqat.egg-info/top_level.txt
reading manifest file 'ncnnqat.egg-info/SOURCES.txt'
reading manifest template ''
writing manifest file 'ncnnqat.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
running build_ext
building 'quant_cuda' extension
Emitting ninja build file /root/ncnnqat/build/temp.linux-x86_64-3.7/
Compiling objects...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
g++ -pthread -shared -B /opt/conda/compiler_compat -L/opt/conda/lib -Wl,-rpath=/opt/conda/lib -Wl,--no-as-needed -Wl,--sysroot=/ /root/ncnnqat/build/temp.linux-x86_64-3.7/./src/fake_quantize.o -Lobj -L/opt/conda/lib/python3.7/site-packages/torch/lib -L/usr/local/cuda/lib64 -lquant_cuda -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-3.7/
/opt/conda/compiler_compat/ld: cannot find -lquant_cuda
collect2: error: ld returned 1 exit status
error: command 'g++' failed with exit status 1
Makefile:106: recipe for target 'install' failed
make: *** [install] Error 1


Copy link

yolunghiu commented Aug 18, 2023 via email

Copy link


#include "fake_quantize.h"

/**
 * Per-block reduction: finds the entry with the largest magnitude in one row
 * of `data` and writes its absolute value to `out_ptr[blockIdx.x]`.
 *
 * Block `b` reads `data[b * width + k]` for `k` in `[0, width)`. Threads walk
 * the row in steps of `blockSize`, so the kernel assumes it is launched with
 * `blockDim.x == blockSize` (the host caller in this file does so).
 *
 * @param data     input values, one row of `width` elements per block
 * @param out_ptr  output, one slot per block
 * @param width    number of elements each block reduces
 * @param lg_n     starting span of the tree reduction; the host caller passes
 *                 a power of two that is >= width, capped at blockSize
 */
__global__ void max_reduce(float* __restrict__ data, float* out_ptr, int width, int lg_n)
{
    __shared__ float middleware[blockSize];
    const float min_positive_float = 1e-6;
    int row = blockIdx.x * width + threadIdx.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    int tid_tmp = threadIdx.x;

    // Seed each slot: threads past the end of the row contribute a tiny
    // positive value so they never win the comparison against real data.
    if (tid < width)
    {
        middleware[tid] = data[row];
    }
    else
    {
        middleware[tid] = min_positive_float;
    }

    // Fold the remainder of the row into this thread's slot.
    row += blockSize;
    tid_tmp += blockSize;
    while (tid_tmp < width)
    {
        if (fabs(data[row]) > fabs(middleware[tid]))
        {
            middleware[tid] = data[row];
        }
        row += blockSize;
        tid_tmp += blockSize;
    }
    // All slots must be written before any thread reads a neighbour's slot.
    __syncthreads();

    // Tree reduction in shared memory, halving the active span each step.
    for (int i = lg_n / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            if (fabs(middleware[tid + i]) > fabs(middleware[tid]))
            {
                middleware[tid] = middleware[tid + i];
            }
        }
        // Barrier is outside the `tid < i` branch so every thread reaches it.
        __syncthreads();
    }

    if (tid == 0)
    {
        out_ptr[bid] = fabs(middleware[0]);
    }
}


/**
 * Per-layer fake quantization driven by a moving maximum.
 *
 * Each in-range thread quantizes `a[index]` to an integer in
 * `[-(2^(bit_width-1) - 1), 2^(bit_width-1) - 1]` and writes the de-quantized
 * value to `o[index]`. Every in-range thread also stores the scale in `o1[0]`
 * and the updated moving maximum in `mov_max[0]`.
 *
 * @param a          input values
 * @param o          output, fake-quantized values (same length as `a`)
 * @param o1         output, `o1[0]` receives the scale
 * @param mov_max    in/out, `mov_max[0]` is the moving maximum
 * @param size       number of elements in `a` / `o`
 * @param bit_width  quantization bit width
 * @param max_entry  pointer to the current max(|a|)
 */
__global__ void fake_quantize_layer_google(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           float* mov_max,
                                           int size,
                                           int bit_width,
                                           float* max_entry)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        const float momenta = 0.95;
        // NOTE(review): mov_max[0] is read here and written below by every
        // thread with no synchronization; a thread that runs after another has
        // stored may see the already-updated value. Confirm this is acceptable.
        float mov_max_tmp = mov_max[0];
        if (mov_max_tmp < 1e-6)
        {
            // movMax defaults to 0; on the first step seed it with a non-zero value.
            mov_max_tmp = fabs(*max_entry);
        }
        else
        {
            // GOOGLE QAT: movMax = movMax * momenta + max(abs(tensor)) * (1 - momenta), momenta = 0.95
            mov_max_tmp = mov_max_tmp * momenta + fabs(*max_entry) * (1. - momenta);
        }
        float data_scale = __powf(2., bit_width - 1.) - 1;

        // Guard the division against a (near-)zero maximum.
        float scale;
        if (mov_max_tmp < 1e-6) scale = __fdividef(data_scale, 1e-6);
        else scale = __fdividef(data_scale, mov_max_tmp);

        // Quantize, clamp to the symmetric integer range, then de-quantize.
        int o_int = round(a[index] * scale);
        if (o_int > data_scale) o_int = (int)data_scale;
        else if (o_int < -data_scale) o_int = (int)(-data_scale);
        o[index] = __fdividef(o_int * 1., scale);

        o1[0] = scale;
        mov_max[0] = mov_max_tmp;
    }
}


/**
 * Per-layer fake quantization with an ACIQ-style clipping threshold.
 *
 * The threshold is `alpha_gaussian[bit_width - 1] * std`, where `std` is
 * derived from the current maximum and `feature_pixl_num`. Each in-range
 * thread quantizes `a[index]` with `scale = (2^(bit_width-1) - 1) / threshold`,
 * clamps, and writes the de-quantized value to `o[index]`. Every in-range
 * thread also stores the scale in `o1[0]` and the maximum in `mov_max[0]`.
 *
 * @param a                 input values
 * @param o                 output, fake-quantized values
 * @param o1                output, `o1[0]` receives the scale
 * @param mov_max           in/out, `mov_max[0]` receives |*max_entry|
 * @param feature_pixl_num  element count used in the std estimate (the host
 *                          caller passes numel / batch size)
 * @param size              number of elements in `a` / `o`
 * @param bit_width         quantization bit width; indexes an 8-entry table,
 *                          so it must be in [1, 8]
 * @param max_entry         pointer to the current max(|a|)
 */
__global__ void fake_quantize_layer_aciq(float* __restrict__ a,
                                         float* o,
                                         float* o1,
                                         float* mov_max,
                                         int feature_pixl_num,
                                         int size,
                                         int bit_width,
                                         float* max_entry)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        float mov_max_tmp = mov_max[0];
        // Both branches currently take |*max_entry|; the momentum update
        //   mov_max_tmp * momenta + fabs(*max_entry) * (1. - momenta)   (momenta = 0.95)
        // is disabled for ACIQ. The branch is kept to mirror the movmax kernel.
        if (mov_max_tmp < 1e-6) mov_max_tmp = fabs(*max_entry);
        else mov_max_tmp = fabs(*max_entry);
        float data_scale = __powf(2., bit_width - 1.) - 1;

        const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
        const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * __logf(4.)));
        double std = (mov_max_tmp * 2 * gaussian_const) / sqrt(2 * __logf(feature_pixl_num));
        float threshold = (float)(alpha_gaussian[bit_width - 1] * std);

        // Guard the division against a (near-)zero threshold.
        float scale;
        if (threshold < 1e-6) scale = __fdividef(data_scale, 1e-6);
        else scale = __fdividef(data_scale, threshold);

        // Quantize, clamp to the symmetric integer range, then de-quantize.
        int o_int = round(a[index] * scale);
        if (o_int > data_scale) o_int = (int)data_scale;
        else if (o_int < -data_scale) o_int = (int)(-data_scale);
        o[index] = __fdividef(o_int * 1., scale);

        o1[0] = scale;
        mov_max[0] = mov_max_tmp;
    }
}


/**
 * Per-channel fake quantization with an ACIQ-style clipping threshold.
 *
 * Element `index` belongs to channel `index / channel_num`. The channel's
 * threshold is `alpha_gaussian[bit_width - 1] * std`, with `std` derived from
 * that channel's maximum. The first element of each channel also stores the
 * channel scale in `o1[channel]`.
 *
 * @param a              input values
 * @param o              output, fake-quantized values
 * @param o1             output, one scale per channel
 * @param size           number of elements in `a` / `o`
 * @param bit_width      quantization bit width; indexes an 8-entry table, so
 *                       it must be in [1, 8]
 * @param max_entry_arr  per-channel maxima; entries below 1e-6 are raised to
 *                       1e-6 in place
 * @param channel_num    number of elements per channel (the host caller
 *                       passes numel / c)
 */
__global__ void fake_quantize_channel_aciq(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           int size,
                                           int bit_width,
                                           float* max_entry_arr,  // max_entry_arr already > 0
                                           int channel_num)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        int channel = index / channel_num;
        float* max_entry = max_entry_arr + channel;
        float data_scale = __powf(2., bit_width - 1.) - 1;
        // Keep the maximum strictly positive so the threshold cannot be zero.
        if ((*max_entry) < 1e-6)
        {
            *max_entry = 1e-6;
        }
        const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
        const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * __logf(4.)));
        double std = ((*max_entry) * 2 * gaussian_const) / sqrt(2 * __logf(channel_num));
        float threshold = (float)(alpha_gaussian[bit_width - 1] * std);

        // Quantize, clamp to the symmetric integer range, then de-quantize.
        float scale = __fdividef(data_scale, threshold);
        int o_int = round(a[index] * scale);
        if (o_int > data_scale) o_int = (int)data_scale;
        else if (o_int < -data_scale) o_int = (int)(-data_scale);
        o[index] = __fdividef(o_int * 1., scale);

        // Only the first element of each channel publishes the scale.
        if (index % channel_num == 0) o1[channel] = scale;
    }
}

/**
 * Per-channel fake quantization using each channel's maximum directly.
 *
 * Element `index` belongs to channel `index / channel_num`. With
 * `scale = (2^(bit_width-1) - 1) / max_entry_arr[channel]`, each in-range
 * thread writes `round(a[index] * scale) / scale` to `o[index]`. The first
 * element of each channel also stores the scale in `o1[channel]`.
 *
 * @param a              input values
 * @param o              output, fake-quantized values
 * @param o1             output, one scale per channel
 * @param size           number of elements in `a` / `o`
 * @param bit_width      quantization bit width
 * @param max_entry_arr  per-channel maxima; entries below 1e-6 are raised to
 *                       1e-6 in place
 * @param channel_num    number of elements per channel (the host caller
 *                       passes numel / c)
 */
__global__ void fake_quantize_channel_cuda(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           int size,
                                           int bit_width,
                                           float* max_entry_arr,  // max_entry_arr already > 0
                                           int channel_num)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        int channel = index / channel_num;
        float* max_entry = max_entry_arr + channel;
        float data_scale = __powf(2., bit_width - 1.) - 1;
        // Keep the maximum strictly positive so the division below is safe.
        if ((*max_entry) < 1e-6)
        {
            *max_entry = 1e-6;
        }
        float scale = __fdividef(data_scale, *max_entry);
        o[index] = __fdividef(round(a[index] * scale), scale);

        // Only the first element of each channel publishes the scale.
        if (index % channel_num == 0) o1[channel] = scale;
    }
}
/**
 * Per-layer (activation) fake quantization entry point.
 *
 * Computes max(|a|) on the tensor, then launches either the moving-max kernel
 * (`aciq == 0`) or the ACIQ kernel (otherwise) with one thread per element.
 *
 * @param a          input tensor; accessed through data_ptr<float>()
 *                   (assumes a float CUDA tensor — TODO confirm)
 * @param bit_width  quantization bit width
 * @param aciq       0 selects the moving-max kernel, anything else ACIQ
 * @return {quantized output, scale (1 element), moving max (1 element)}
 */
std::vector<Tensor> fake_quantize_activate_cuda(Tensor a, int bit_width, int aciq)
{
    auto o = at::zeros_like(a);                  // quantized output
    auto o1 = at::zeros({1}, a.options());       // scale
    // Max of tensor. GOOGLE QAT: movMax = movMax * momenta + max(abs(tensor)) * (1 - momenta), momenta = 0.95
    // NOTE(review): mov_max is zero-initialized on every call, so the kernels
    // always see 0 on entry here; confirm whether state should persist.
    auto mov_max = at::zeros({1}, a.options());
    int64_t size = a.numel();

    int batch_size = a.size(0);                  // batch size
    int feature_pixl_num = size / batch_size;

    Tensor max_entry = at::max(at::abs(a));
    int blockNums = (size + blockSize - 1) / blockSize;

    if (aciq == 0)  // movmax
    {
        fake_quantize_layer_google<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                             o.data_ptr<float>(),
                                                             o1.data_ptr<float>(),
                                                             mov_max.data_ptr<float>(),
                                                             size,
                                                             bit_width,
                                                             max_entry.data_ptr<float>());
    }
    else  // aciq
    {
        fake_quantize_layer_aciq<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                           o.data_ptr<float>(),
                                                           o1.data_ptr<float>(),
                                                           mov_max.data_ptr<float>(),
                                                           feature_pixl_num,
                                                           size,
                                                           bit_width,
                                                           max_entry.data_ptr<float>());
    }
    return {o, o1, mov_max};
}


/**
 * Per-channel (weight) fake quantization entry point.
 *
 * First launches `max_reduce` with one block per channel to fill
 * `max_entry_arr`, then launches either the plain per-channel kernel
 * (`aciq == 0`) or the ACIQ per-channel kernel (otherwise).
 *
 * @param a          input tensor; accessed through data_ptr<float>()
 *                   (assumes a float CUDA tensor — TODO confirm)
 * @param bit_width  quantization bit width
 * @param c          number of channels; the tensor is split into `c` equal
 *                   runs of numel / c elements
 * @param aciq       0 selects the plain kernel, anything else ACIQ
 * @return {quantized output, per-channel scales (c elements)}
 */
std::vector<Tensor> fake_quantize_weight_cuda(Tensor a, int bit_width, int c, int aciq)
{
    auto o = at::zeros_like(a);                  // quantized output
    auto o1 = at::zeros({c}, a.options());       // per-channel scale
    int64_t size = a.numel();

    int blockNums = (size + blockSize - 1) / blockSize;
    int channel_num = size / c;                  // elements per channel
    auto max_entry_arr = at::zeros({c}, a.options());

    // Round the per-channel length up to a power of two for the tree
    // reduction, capped at the block size.
    int lg_n = ceil(log2(channel_num * 1.));     // smallest x with 2^x >= channel_num
    lg_n = pow(2, lg_n);                         // 2^x
    if (lg_n > blockSize) lg_n = blockSize;

    // c blocks, each block produces one channel's max value.
    max_reduce<<<c, blockSize>>>(a.data_ptr<float>(),
                                 max_entry_arr.data_ptr<float>(),
                                 channel_num,
                                 lg_n);

    if (aciq == 0)
    {
        fake_quantize_channel_cuda<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                             o.data_ptr<float>(),
                                                             o1.data_ptr<float>(),
                                                             size,
                                                             bit_width,
                                                             max_entry_arr.data_ptr<float>(),  // max_entry_arr already > 0
                                                             channel_num);
    }
    else  // aciq
    {
        fake_quantize_channel_aciq<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                             o.data_ptr<float>(),
                                                             o1.data_ptr<float>(),
                                                             size,
                                                             bit_width,
                                                             max_entry_arr.data_ptr<float>(),  // max_entry_arr already > 0
                                                             channel_num);
    }

    return {o, o1};
}


/**
 * Dispatches to the per-layer or per-channel fake-quantization path.
 *
 * @param a          input tensor
 * @param bit_width  quantization bit width
 * @param type       0 = per layer (activations), otherwise per channel (weights)
 * @param c          number of channels; only used on the per-channel path
 * @param aciq       forwarded unchanged; 0 disables ACIQ
 * @return the tensors returned by the selected path
 */
std::vector<Tensor> fake_quantize_cuda(Tensor a, int bit_width, int type, int c, int aciq)
{
    /* 2.5
     * For weights, we use the actual minimum and maximum values to determine the quantizer parameters.
     * For activations, we use the moving average of the minimum and maximum values across batches to determine the quantizer parameters.
     * float 6 7, double 15 16 (presumably significant decimal digits — TODO confirm)
     */
    if (type == 0) return fake_quantize_activate_cuda(a, bit_width, aciq);  // type == 0: per layer
    else return fake_quantize_weight_cuda(a, bit_width, c, aciq);           // type == 1: per channel
}

Copy link


Copy link

torch >= 1.6
numpy >= 1.18.1
onnx >= 1.7.0
onnx-simplifier >= 0.3.6


Copy link


Copy link


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
None yet
None yet

No branches or pull requests

3 participants