forked from Tencent/ncnn
-
Notifications
You must be signed in to change notification settings - Fork 18
vulkan notes
nihuini edited this page Sep 2, 2019
·
1 revision
- Y = known work
- ? = shall work, not confirmed
- / = not applied
| | windows | linux | android | mac | ios |
|---|---|---|---|---|---|
intel | Y | Y | ? | ? | / |
amd | Y | Y | / | ? | / |
nvidia | Y | Y | ? | / | / |
qcom | / | / | Y | / | / |
apple | / | / | / | ? | Y |
arm | / | ? | ? | / | / |
# Install the Vulkan development headers/loader (Fedora; package names differ on other distros).
$ sudo dnf install vulkan-devel
# Configure ncnn with Vulkan GPU compute support enabled.
$ cmake -DNCNN_VULKAN=ON ..
// Create the global Vulkan GPU instance before doing any GPU work...
ncnn::create_gpu_instance();
{
// ...all GPU usage (Net, allocators, VkCompute, ...) goes in here...
...
}
// ...and destroy it only after every GPU object has been released.
ncnn::destroy_gpu_instance();
// Enable Vulkan compute inference on this Net and wire up per-Net allocators.
ncnn::Net net;
net.opt.use_vulkan_compute = 1;

// Acquire allocators from the device: one for device-local blob/workspace
// memory, one for host-visible staging memory used for upload/download.
ncnn::VkAllocator* blob_vkallocator = vkdev.acquire_blob_allocator();
// FIX: this must be acquire_staging_allocator() — the original text called
// acquire_blob_allocator() here, which mismatches the
// reclaim_staging_allocator() call below.
ncnn::VkAllocator* staging_vkallocator = vkdev.acquire_staging_allocator();
net.opt.blob_vkallocator = blob_vkallocator;
net.opt.workspace_vkallocator = blob_vkallocator; // workspace shares the blob allocator
net.opt.staging_vkallocator = staging_vkallocator;
// ....
// after inference
// Return each allocator to the device with the matching reclaim call.
vkdev.reclaim_blob_allocator(blob_vkallocator);
vkdev.reclaim_staging_allocator(staging_vkallocator);
// Select which Vulkan device a Net runs on. Must happen before loading
// param/model. The two set_vulkan_device calls below are alternatives —
// pick one index in [0, gpu_count).
// get gpu count
int gpu_count = ncnn::get_gpu_count();
// set specified vulkan device before loading param and model
net.set_vulkan_device(0); // use device-0
net.set_vulkan_device(1); // use device-1
// Map a VkMat's device memory into a host-side ncnn::Mat view.
// NOTE(review): presumably valid only when the underlying allocation is
// host-mappable (e.g. staging memory) — confirm against the allocator used.
ncnn::VkMat blob_gpu;
ncnn::Mat mapped = blob_gpu.mapped();
// use mapped.data directly
// Run CPU and GPU inference concurrently: create two extractors from the
// same Net, force one onto the CPU and one onto the GPU, and drive them
// from parallel OpenMP sections. (input()/extract() arguments elided here.)
ncnn::Extractor ex_cpu = net.create_extractor();
ncnn::Extractor ex_gpu = net.create_extractor();
ex_cpu.set_vulkan_compute(false); // this extractor stays on the CPU
ex_gpu.set_vulkan_compute(true);  // this extractor uses Vulkan compute
#pragma omp parallel sections
{
#pragma omp section
{
// CPU inference path
ex_cpu.input();
ex_cpu.extract();
}
#pragma omp section
{
// GPU inference path, runs concurrently with the CPU section
ex_gpu.input();
ex_gpu.extract();
}
}
// Chain two nets on the GPU without downloading intermediates: record both
// extractions into one shared VkCompute command buffer, feeding net1's
// output blob ("conv2") straight into net2, then submit and wait once.
ncnn::Extractor ex1 = net1.create_extractor();
ncnn::Extractor ex2 = net2.create_extractor();
ncnn::VkCompute cmd(&vkdev);
ncnn::VkMat conv1;
ncnn::VkMat conv2;
ncnn::VkMat conv3;
ex1.input("conv1", conv1);
ex1.extract("conv2", conv2, cmd); // recorded into cmd, not executed yet
ex2.input("conv2", conv2);        // conv2 stays on the GPU between nets
ex2.extract("conv3", conv3, cmd);
// Execute the whole recorded chain in one submission.
cmd.submit();
cmd.wait();
// Batched inference: run independent extractions in parallel, with the
// thread count bounded by the device's compute queue count so each thread
// can own a queue.
int max_batch_size = vkdev->info.compute_queue_count;
ncnn::Mat inputs[1000];
ncnn::Mat outputs[1000];
#pragma omp parallel for num_threads(max_batch_size)
for (int i=0; i<1000; i++)
{
// each iteration uses its own short-lived Extractor
ncnn::Extractor ex = net1.create_extractor();
ex.input("data", inputs[i]);
ex.extract("prob", outputs[i]);
}
To get full fp32 precision, disable all lower-precision optimizations:
// Force full fp32 inference by turning off every lower-precision path.
ncnn::Net net;
net.opt.use_fp16_packed = false;     // no packed fp16 storage
net.opt.use_fp16_storage = false;    // no fp16 weight/blob storage
net.opt.use_fp16_arithmetic = false; // no fp16 math
net.opt.use_int8_storage = false;    // no int8 storage
net.opt.use_int8_arithmetic = false; // no int8 math
#define ENABLE_VALIDATION_LAYER 1 // modify to 1 in gpu.cpp
- add vulkan shader in src/layer/shader/
- upload model weight data in Layer::upload_model()
- setup pipeline in Layer::create_pipeline()
- destroy pipeline in Layer::destroy_pipeline()
- record command in Layer::forward()
- add vulkan shader in src/layer/shader/ named XXX_abc.comp
- create pipeline with "XXX_abc"
- record command using XXX_abc pipeline
- create layer
- load param and load model
- upload model
- create pipeline
- new command
- record
- submit and wait