Skip to content

Commit aa54856

Browse files
Merge pull request #42 from nicolas-chaulet/instance_iou
Instance iou
2 parents 81f0f7f + 8f4ec9e commit aa54856

File tree

17 files changed

+345
-123
lines changed

17 files changed

+345
-123
lines changed

.github/workflows/tests.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,15 @@ jobs:
2222
- name: Install dependencies
2323
run: |
2424
python -m pip install --upgrade pip
25-
pip install numpy scikit-learn flake8 setuptools numba
26-
25+
pip install numpy scikit-learn flake8 setuptools numba==0.49.1
26+
2727
- name: Install torch windows + linux
2828
if: ${{matrix.os != 'macos-latest'}}
2929
run: pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
3030
- name: Install torch macos
3131
if: ${{matrix.os == 'macos-latest'}}
32-
run: pip install torch
33-
32+
run: pip install torch
33+
3434
- name: Build package
3535
run: |
3636
python setup.py build_ext --inplace

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## Additions
44
- Clustering algorithm for [PointGroup](https://arxiv.org/pdf/2004.01658.pdf)
5+
- Instance IoU computation on CPU and GPU
56

67
## Change
78
- Force no ninja for the compilation

cpu/include/utils.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#pragma once
22
#include <torch/extension.h>
33

4-
#define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be a CPU tensor")
4+
#define CHECK_CPU(x) AT_ASSERTM(!x.is_cuda(), #x " must be a CPU tensor")
55

6-
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor")
6+
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor")

cuda/include/metrics.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#pragma once
2+
#include <torch/extension.h>
3+
4+
at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets,
5+
at::Tensor gt_instances, at::Tensor gt_instance_sizes,
6+
at::Tensor num_gt_instances, at::Tensor batch);

cuda/include/utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#define CHECK_CUDA(x) \
66
do \
77
{ \
8-
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor"); \
8+
TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor"); \
99
} while (0)
1010

1111
#define CHECK_CONTIGUOUS(x) \

cuda/src/ball_query.cpp

Lines changed: 14 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,18 @@ std::pair<at::Tensor, at::Tensor> ball_query_dense(at::Tensor new_xyz, at::Tenso
1919
CHECK_IS_FLOAT(new_xyz);
2020
CHECK_IS_FLOAT(xyz);
2121

22-
if (new_xyz.type().is_cuda())
23-
{
24-
CHECK_CUDA(xyz);
25-
}
22+
CHECK_CUDA(xyz);
23+
CHECK_CUDA(new_xyz);
2624

2725
at::Tensor idx = torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample},
2826
at::device(new_xyz.device()).dtype(at::ScalarType::Long));
2927
at::Tensor dist = torch::full({new_xyz.size(0), new_xyz.size(1), nsample}, -1,
3028
at::device(new_xyz.device()).dtype(at::ScalarType::Float));
3129

32-
if (new_xyz.type().is_cuda())
33-
{
34-
query_ball_point_kernel_dense_wrapper(
35-
xyz.size(0), xyz.size(1), new_xyz.size(1), radius, nsample, new_xyz.DATA_PTR<float>(),
36-
xyz.DATA_PTR<float>(), idx.DATA_PTR<long>(), dist.DATA_PTR<float>());
37-
}
38-
else
39-
{
40-
TORCH_CHECK(false, "CPU not supported");
41-
}
30+
query_ball_point_kernel_dense_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), radius,
31+
nsample, new_xyz.DATA_PTR<float>(), xyz.DATA_PTR<float>(),
32+
idx.DATA_PTR<long>(), dist.DATA_PTR<float>());
33+
4234
return std::make_pair(idx, dist);
4335
}
4436

@@ -57,14 +49,10 @@ std::pair<at::Tensor, at::Tensor> ball_query_partial_dense(at::Tensor x, at::Ten
5749
CHECK_CONTIGUOUS(y);
5850
CHECK_IS_FLOAT(x);
5951
CHECK_IS_FLOAT(y);
60-
61-
if (x.type().is_cuda())
62-
{
63-
CHECK_CUDA(x);
64-
CHECK_CUDA(y);
65-
CHECK_CUDA(batch_x);
66-
CHECK_CUDA(batch_y);
67-
}
52+
CHECK_CUDA(x);
53+
CHECK_CUDA(y);
54+
CHECK_CUDA(batch_x);
55+
CHECK_CUDA(batch_y);
6856

6957
at::Tensor idx =
7058
torch::full({y.size(0), nsample}, -1, at::device(y.device()).dtype(at::ScalarType::Long));
@@ -83,17 +71,10 @@ std::pair<at::Tensor, at::Tensor> ball_query_partial_dense(at::Tensor x, at::Ten
8371
batch_y = degree(batch_y, batch_size);
8472
batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
8573

86-
if (x.type().is_cuda())
87-
{
88-
query_ball_point_kernel_partial_wrapper(batch_size, x.size(0), y.size(0), radius, nsample,
89-
x.DATA_PTR<float>(), y.DATA_PTR<float>(),
90-
batch_x.DATA_PTR<long>(), batch_y.DATA_PTR<long>(),
91-
idx.DATA_PTR<long>(), dist.DATA_PTR<float>());
92-
}
93-
else
94-
{
95-
TORCH_CHECK(false, "CPU not supported");
96-
}
74+
query_ball_point_kernel_partial_wrapper(batch_size, x.size(0), y.size(0), radius, nsample,
75+
x.DATA_PTR<float>(), y.DATA_PTR<float>(),
76+
batch_x.DATA_PTR<long>(), batch_y.DATA_PTR<long>(),
77+
idx.DATA_PTR<long>(), dist.DATA_PTR<float>());
9778

9879
return std::make_pair(idx, dist);
9980
}

cuda/src/bindings.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "ball_query.h"
22
#include "interpolate.h"
3+
#include "metrics.h"
34
#include "sampling.h"
45

56
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
@@ -12,4 +13,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
1213

1314
m.def("ball_query_dense", &ball_query_dense);
1415
m.def("ball_query_partial_dense", &ball_query_partial_dense);
16+
17+
m.def("instance_iou_cuda", &instance_iou_cuda);
1518
}

cuda/src/interpolate.cpp

Lines changed: 16 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,17 @@ std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows)
1616
CHECK_IS_FLOAT(unknowns);
1717
CHECK_IS_FLOAT(knows);
1818

19-
if (unknowns.type().is_cuda())
20-
{
21-
CHECK_CUDA(knows);
22-
}
19+
CHECK_CUDA(knows);
20+
CHECK_CUDA(unknowns);
2321

2422
at::Tensor idx = torch::zeros({unknowns.size(0), unknowns.size(1), 3},
2523
at::device(unknowns.device()).dtype(at::ScalarType::Int));
2624
at::Tensor dist2 = torch::zeros({unknowns.size(0), unknowns.size(1), 3},
2725
at::device(unknowns.device()).dtype(at::ScalarType::Float));
2826

29-
if (unknowns.type().is_cuda())
30-
{
31-
three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1),
32-
unknowns.DATA_PTR<float>(), knows.DATA_PTR<float>(),
33-
dist2.DATA_PTR<float>(), idx.DATA_PTR<int>());
34-
}
35-
else
36-
{
37-
TORCH_CHECK(false, "CPU not supported");
38-
}
27+
three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1),
28+
unknowns.DATA_PTR<float>(), knows.DATA_PTR<float>(),
29+
dist2.DATA_PTR<float>(), idx.DATA_PTR<int>());
3930

4031
return {dist2, idx};
4132
}
@@ -49,25 +40,15 @@ at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, at::Tensor weigh
4940
CHECK_IS_INT(idx);
5041
CHECK_IS_FLOAT(weight);
5142

52-
if (points.type().is_cuda())
53-
{
54-
CHECK_CUDA(idx);
55-
CHECK_CUDA(weight);
56-
}
43+
CHECK_CUDA(idx);
44+
CHECK_CUDA(weight);
5745

5846
at::Tensor output = torch::zeros({points.size(0), points.size(1), idx.size(1)},
5947
at::device(points.device()).dtype(at::ScalarType::Float));
6048

61-
if (points.type().is_cuda())
62-
{
63-
three_interpolate_kernel_wrapper(points.size(0), points.size(1), points.size(2),
64-
idx.size(1), points.DATA_PTR<float>(), idx.DATA_PTR<int>(),
65-
weight.DATA_PTR<float>(), output.DATA_PTR<float>());
66-
}
67-
else
68-
{
69-
TORCH_CHECK(false, "CPU not supported");
70-
}
49+
three_interpolate_kernel_wrapper(points.size(0), points.size(1), points.size(2), idx.size(1),
50+
points.DATA_PTR<float>(), idx.DATA_PTR<int>(),
51+
weight.DATA_PTR<float>(), output.DATA_PTR<float>());
7152

7253
return output;
7354
}
@@ -80,26 +61,16 @@ at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tenso
8061
CHECK_IS_FLOAT(grad_out);
8162
CHECK_IS_INT(idx);
8263
CHECK_IS_FLOAT(weight);
83-
84-
if (grad_out.type().is_cuda())
85-
{
86-
CHECK_CUDA(idx);
87-
CHECK_CUDA(weight);
88-
}
64+
CHECK_CUDA(idx);
65+
CHECK_CUDA(weight);
66+
CHECK_CUDA(grad_out);
8967

9068
at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), m},
9169
at::device(grad_out.device()).dtype(at::ScalarType::Float));
9270

93-
if (grad_out.type().is_cuda())
94-
{
95-
three_interpolate_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), grad_out.size(2),
96-
m, grad_out.DATA_PTR<float>(), idx.DATA_PTR<int>(),
97-
weight.DATA_PTR<float>(), output.DATA_PTR<float>());
98-
}
99-
else
100-
{
101-
TORCH_CHECK(false, "CPU not supported");
102-
}
71+
three_interpolate_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), grad_out.size(2), m,
72+
grad_out.DATA_PTR<float>(), idx.DATA_PTR<int>(),
73+
weight.DATA_PTR<float>(), output.DATA_PTR<float>());
10374

10475
return output;
10576
}

cuda/src/metrics.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#include "metrics.h"
2+
#include "compat.h"
3+
#include "utils.h"
4+
5+
void instance_iou_kernel_wrapper(long total_gt_instances, long max_gt_instances,
6+
const long* nInstance, int nProposal, const long* proposals_idx,
7+
const long* proposals_offset, const long* instance_labels,
8+
const long* offset_num_gt_instances, const long* batch,
9+
const long* instance_pointnum, float* proposals_iou);
10+
11+
at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets,
12+
at::Tensor gt_instances, at::Tensor gt_instance_sizes,
13+
at::Tensor num_gt_instances, at::Tensor batch)
14+
{
15+
CHECK_CONTIGUOUS(instance_idx);
16+
CHECK_CONTIGUOUS(instance_offsets);
17+
CHECK_CONTIGUOUS(gt_instances);
18+
CHECK_CONTIGUOUS(gt_instance_sizes);
19+
CHECK_CONTIGUOUS(num_gt_instances);
20+
CHECK_CONTIGUOUS(batch);
21+
22+
CHECK_CUDA(instance_idx);
23+
CHECK_CUDA(instance_offsets);
24+
CHECK_CUDA(gt_instances);
25+
CHECK_CUDA(gt_instance_sizes);
26+
27+
cudaSetDevice(instance_idx.get_device());
28+
long num_proposed_instances = instance_offsets.size(0) - 1;
29+
auto total_gt_instances = (int64_t*)malloc(sizeof(int64_t));
30+
cudaMemcpy(total_gt_instances, num_gt_instances.sum().DATA_PTR<int64_t>(), sizeof(int64_t),
31+
cudaMemcpyDeviceToHost);
32+
auto max_gt_instances = (int64_t*)malloc(sizeof(int64_t));
33+
cudaMemcpy(max_gt_instances, num_gt_instances.max().DATA_PTR<int64_t>(), sizeof(int64_t),
34+
cudaMemcpyDeviceToHost);
35+
36+
at::Tensor output =
37+
torch::zeros({num_proposed_instances, total_gt_instances[0]},
38+
at::device(gt_instances.device()).dtype(at::ScalarType::Float));
39+
40+
at::Tensor offset_num_gt_instances =
41+
at::cat({at::zeros(1, num_gt_instances.options()), num_gt_instances.cumsum(0)}, 0);
42+
instance_iou_kernel_wrapper(
43+
total_gt_instances[0], max_gt_instances[0], num_gt_instances.DATA_PTR<long>(),
44+
num_proposed_instances, instance_idx.DATA_PTR<long>(), instance_offsets.DATA_PTR<long>(),
45+
gt_instances.DATA_PTR<long>(), offset_num_gt_instances.DATA_PTR<long>(),
46+
batch.DATA_PTR<long>(), gt_instance_sizes.DATA_PTR<long>(), output.DATA_PTR<float>());
47+
48+
return output;
49+
}

cuda/src/metrics_gpu.cu

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#include <math.h>
2+
#include <stdio.h>
3+
#include <stdlib.h>
4+
5+
#include "cuda_utils.h"
6+
7+
#define THREADS 512
8+
9+
__global__ void instance_iou_cuda_kernel(
10+
long total_gt_instances, const long* __restrict__ nInstance, int nProposal,
11+
const long* __restrict__ proposals_idx, const long* __restrict__ proposals_offset,
12+
const long* __restrict__ instance_labels, const long* __restrict__ offset_num_gt_instances,
13+
const long* __restrict__ batch, const long* __restrict__ instance_pointnum,
14+
float* proposals_iou)
15+
{
16+
for (int proposal_id = blockIdx.x; proposal_id < nProposal; proposal_id += gridDim.x)
17+
{
18+
int start = proposals_offset[proposal_id];
19+
int end = proposals_offset[proposal_id + 1];
20+
int sampleIdx = batch[proposals_idx[start]];
21+
int sampleNInstances = nInstance[sampleIdx];
22+
int instanceOffset = offset_num_gt_instances[sampleIdx];
23+
int proposal_total = end - start;
24+
for (int instance_id = threadIdx.x; instance_id < sampleNInstances;
25+
instance_id += blockDim.x)
26+
{
27+
int instance_total = instance_pointnum[instanceOffset + instance_id];
28+
int intersection = 0;
29+
for (int i = start; i < end; i++)
30+
{
31+
int idx = proposals_idx[i];
32+
if ((int)instance_labels[idx] == instance_id + 1)
33+
{ // 0 is reserved for "no instance"
34+
intersection += 1;
35+
}
36+
}
37+
38+
proposals_iou[instanceOffset + instance_id + proposal_id * total_gt_instances] =
39+
(float)intersection /
40+
((float)(proposal_total + instance_total - intersection) + 1e-5);
41+
}
42+
}
43+
}
44+
45+
// input: proposals_idx (sumNPoint), int
46+
// input: proposals_offset (nProposal + 1), int
47+
// input: instance_labels (N), long, 0~total_nInst-1, -100
48+
// input: instance_pointnum (total_nInst), int
49+
// output: proposals_iou (nProposal, total_nInst), float
50+
void instance_iou_kernel_wrapper(long total_gt_instances, long max_gt_instances,
51+
const long* nInstance, int nProposal, const long* proposals_idx,
52+
const long* proposals_offset, const long* instance_labels,
53+
const long* offset_num_gt_instances, const long* batch,
54+
const long* instance_pointnum, float* proposals_iou)
55+
{
56+
auto stream = at::cuda::getCurrentCUDAStream();
57+
instance_iou_cuda_kernel<<<std::min(nProposal, THREADS * THREADS),
58+
std::min(max_gt_instances, (long)THREADS), 0, stream>>>(
59+
total_gt_instances, nInstance, nProposal, proposals_idx, proposals_offset, instance_labels,
60+
offset_num_gt_instances, batch, instance_pointnum, proposals_iou);
61+
}

0 commit comments

Comments
 (0)