torch-points3d
diff --git a/‎.github/workflows/tests.yaml‎
Lines changed: 4 additions & 4 deletions b/‎.github/workflows/tests.yaml‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpu/include/utils.h‎
Lines changed: 2 additions & 2 deletions b/‎cpu/include/utils.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cuda/include/metrics.h‎
Lines changed: 6 additions & 0 deletions b/‎cuda/include/metrics.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cuda/include/utils.h‎
Lines changed: 1 addition & 1 deletion b/‎cuda/include/utils.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cuda/src/ball_query.cpp‎
Lines changed: 14 additions & 33 deletions b/‎cuda/src/ball_query.cpp‎
Lines changed: 14 additions & 33 deletions
diff --git a/‎cuda/src/bindings.cpp‎
Lines changed: 3 additions & 0 deletions b/‎cuda/src/bindings.cpp‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎cuda/src/interpolate.cpp‎
Lines changed: 16 additions & 45 deletions b/‎cuda/src/interpolate.cpp‎
Lines changed: 16 additions & 45 deletions
diff --git a/‎cuda/src/metrics.cpp‎
Lines changed: 49 additions & 0 deletions b/‎cuda/src/metrics.cpp‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎cuda/src/metrics_gpu.cu‎
Lines changed: 61 additions & 0 deletions b/‎cuda/src/metrics_gpu.cu‎
Lines changed: 61 additions & 0 deletions
@@ -22,15 +22,15 @@ jobs:
         - name: Install dependencies
           run: |
               python -m pip install --upgrade pip
-              pip install numpy scikit-learn flake8 setuptools numba
-              
+              pip install numpy scikit-learn flake8 setuptools numba==0.49.1
+
         - name: Install torch windows + linux
           if: ${{matrix.os != 'macos-latest'}}
           run: pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
         - name: Install torch macos
           if: ${{matrix.os == 'macos-latest'}}
-          run: pip install torch 
-            
+          run: pip install torch
+
         - name: Build package
           run: |
               python setup.py build_ext --inplace
 
@@ -2,6 +2,7 @@
 
 ## Additions
 - Clustering algorithm for [PointGroup](https://arxiv.org/pdf/2004.01658.pdf)
+- Instance IoU computation on CPU and GPU
 
 ## Change
 - Force no ninja for the compilation
 
@@ -1,6 +1,6 @@
 #pragma once
 #include <torch/extension.h>
 
-#define CHECK_CPU(x) AT_ASSERTM(!x.type().is_cuda(), #x " must be a CPU tensor")
+#define CHECK_CPU(x) AT_ASSERTM(!x.is_cuda(), #x " must be a CPU tensor")
 
-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be a contiguous tensor")
@@ -0,0 +1,6 @@
+#pragma once
+#include <torch/extension.h>
+
+// GPU instance IoU between proposed instances and ground-truth instances.
+// Takes six tensors and returns one tensor; the definition is not in this header.
+// NOTE(review): presumably all arguments are contiguous int64 CUDA tensors and the result is a
+// float tensor of shape (num proposals, total number of gt instances) - confirm against the
+// implementation in metrics.cpp.
+at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets,
+                             at::Tensor gt_instances, at::Tensor gt_instance_sizes,
+                             at::Tensor num_gt_instances, at::Tensor batch);
@@ -5,7 +5,7 @@
 #define CHECK_CUDA(x)                                                                              \
     do                                                                                             \
     {                                                                                              \
-        TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor");                              \
+        TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor");                                     \
     } while (0)
 
 #define CHECK_CONTIGUOUS(x)                                                                        \
 
@@ -19,26 +19,18 @@ std::pair<at::Tensor, at::Tensor> ball_query_dense(at::Tensor new_xyz, at::Tenso
     CHECK_IS_FLOAT(new_xyz);
     CHECK_IS_FLOAT(xyz);
 
-    if (new_xyz.type().is_cuda())
-    {
-        CHECK_CUDA(xyz);
-    }
+    CHECK_CUDA(xyz);
+    CHECK_CUDA(new_xyz);
 
     at::Tensor idx = torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample},
                                   at::device(new_xyz.device()).dtype(at::ScalarType::Long));
     at::Tensor dist = torch::full({new_xyz.size(0), new_xyz.size(1), nsample}, -1,
                                   at::device(new_xyz.device()).dtype(at::ScalarType::Float));
 
-    if (new_xyz.type().is_cuda())
-    {
-        query_ball_point_kernel_dense_wrapper(
-            xyz.size(0), xyz.size(1), new_xyz.size(1), radius, nsample, new_xyz.DATA_PTR<float>(),
-            xyz.DATA_PTR<float>(), idx.DATA_PTR<long>(), dist.DATA_PTR<float>());
-    }
-    else
-    {
-        TORCH_CHECK(false, "CPU not supported");
-    }
+    query_ball_point_kernel_dense_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), radius,
+                                          nsample, new_xyz.DATA_PTR<float>(), xyz.DATA_PTR<float>(),
+                                          idx.DATA_PTR<long>(), dist.DATA_PTR<float>());
+
     return std::make_pair(idx, dist);
 }
 
@@ -57,14 +49,10 @@ std::pair<at::Tensor, at::Tensor> ball_query_partial_dense(at::Tensor x, at::Ten
     CHECK_CONTIGUOUS(y);
     CHECK_IS_FLOAT(x);
     CHECK_IS_FLOAT(y);
-
-    if (x.type().is_cuda())
-    {
-        CHECK_CUDA(x);
-        CHECK_CUDA(y);
-        CHECK_CUDA(batch_x);
-        CHECK_CUDA(batch_y);
-    }
+    CHECK_CUDA(x);
+    CHECK_CUDA(y);
+    CHECK_CUDA(batch_x);
+    CHECK_CUDA(batch_y);
 
     at::Tensor idx =
         torch::full({y.size(0), nsample}, -1, at::device(y.device()).dtype(at::ScalarType::Long));
@@ -83,17 +71,10 @@ std::pair<at::Tensor, at::Tensor> ball_query_partial_dense(at::Tensor x, at::Ten
     batch_y = degree(batch_y, batch_size);
     batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
 
-    if (x.type().is_cuda())
-    {
-        query_ball_point_kernel_partial_wrapper(batch_size, x.size(0), y.size(0), radius, nsample,
-                                                x.DATA_PTR<float>(), y.DATA_PTR<float>(),
-                                                batch_x.DATA_PTR<long>(), batch_y.DATA_PTR<long>(),
-                                                idx.DATA_PTR<long>(), dist.DATA_PTR<float>());
-    }
-    else
-    {
-        TORCH_CHECK(false, "CPU not supported");
-    }
+    query_ball_point_kernel_partial_wrapper(batch_size, x.size(0), y.size(0), radius, nsample,
+                                            x.DATA_PTR<float>(), y.DATA_PTR<float>(),
+                                            batch_x.DATA_PTR<long>(), batch_y.DATA_PTR<long>(),
+                                            idx.DATA_PTR<long>(), dist.DATA_PTR<float>());
 
     return std::make_pair(idx, dist);
 }
@@ -1,5 +1,6 @@
 #include "ball_query.h"
 #include "interpolate.h"
+#include "metrics.h"
 #include "sampling.h"
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
@@ -12,4 +13,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 
     m.def("ball_query_dense", &ball_query_dense);
     m.def("ball_query_partial_dense", &ball_query_partial_dense);
+
+    m.def("instance_iou_cuda", &instance_iou_cuda);
 }
@@ -16,26 +16,17 @@ std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows)
     CHECK_IS_FLOAT(unknowns);
     CHECK_IS_FLOAT(knows);
 
-    if (unknowns.type().is_cuda())
-    {
-        CHECK_CUDA(knows);
-    }
+    CHECK_CUDA(knows);
+    CHECK_CUDA(unknowns);
 
     at::Tensor idx = torch::zeros({unknowns.size(0), unknowns.size(1), 3},
                                   at::device(unknowns.device()).dtype(at::ScalarType::Int));
     at::Tensor dist2 = torch::zeros({unknowns.size(0), unknowns.size(1), 3},
                                     at::device(unknowns.device()).dtype(at::ScalarType::Float));
 
-    if (unknowns.type().is_cuda())
-    {
-        three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1),
-                                unknowns.DATA_PTR<float>(), knows.DATA_PTR<float>(),
-                                dist2.DATA_PTR<float>(), idx.DATA_PTR<int>());
-    }
-    else
-    {
-        TORCH_CHECK(false, "CPU not supported");
-    }
+    three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1),
+                            unknowns.DATA_PTR<float>(), knows.DATA_PTR<float>(),
+                            dist2.DATA_PTR<float>(), idx.DATA_PTR<int>());
 
     return {dist2, idx};
 }
@@ -49,25 +40,15 @@ at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, at::Tensor weigh
     CHECK_IS_INT(idx);
     CHECK_IS_FLOAT(weight);
 
-    if (points.type().is_cuda())
-    {
-        CHECK_CUDA(idx);
-        CHECK_CUDA(weight);
-    }
+    CHECK_CUDA(idx);
+    CHECK_CUDA(weight);
 
     at::Tensor output = torch::zeros({points.size(0), points.size(1), idx.size(1)},
                                      at::device(points.device()).dtype(at::ScalarType::Float));
 
-    if (points.type().is_cuda())
-    {
-        three_interpolate_kernel_wrapper(points.size(0), points.size(1), points.size(2),
-                                         idx.size(1), points.DATA_PTR<float>(), idx.DATA_PTR<int>(),
-                                         weight.DATA_PTR<float>(), output.DATA_PTR<float>());
-    }
-    else
-    {
-        TORCH_CHECK(false, "CPU not supported");
-    }
+    three_interpolate_kernel_wrapper(points.size(0), points.size(1), points.size(2), idx.size(1),
+                                     points.DATA_PTR<float>(), idx.DATA_PTR<int>(),
+                                     weight.DATA_PTR<float>(), output.DATA_PTR<float>());
 
     return output;
 }
@@ -80,26 +61,16 @@ at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tenso
     CHECK_IS_FLOAT(grad_out);
     CHECK_IS_INT(idx);
     CHECK_IS_FLOAT(weight);
-
-    if (grad_out.type().is_cuda())
-    {
-        CHECK_CUDA(idx);
-        CHECK_CUDA(weight);
-    }
+    CHECK_CUDA(idx);
+    CHECK_CUDA(weight);
+    CHECK_CUDA(grad_out);
 
     at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), m},
                                      at::device(grad_out.device()).dtype(at::ScalarType::Float));
 
-    if (grad_out.type().is_cuda())
-    {
-        three_interpolate_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), grad_out.size(2),
-                                              m, grad_out.DATA_PTR<float>(), idx.DATA_PTR<int>(),
-                                              weight.DATA_PTR<float>(), output.DATA_PTR<float>());
-    }
-    else
-    {
-        TORCH_CHECK(false, "CPU not supported");
-    }
+    three_interpolate_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), grad_out.size(2), m,
+                                          grad_out.DATA_PTR<float>(), idx.DATA_PTR<int>(),
+                                          weight.DATA_PTR<float>(), output.DATA_PTR<float>());
 
     return output;
 }
@@ -0,0 +1,49 @@
+#include "metrics.h"
+#include "compat.h"
+#include "utils.h"
+
+// Forward declaration of the kernel launcher; it is not defined in this translation unit.
+// All pointer arguments are handed over from tensor data pointers by instance_iou_cuda below.
+// NOTE(review): presumably defined in metrics_gpu.cu and expecting device pointers - confirm.
+void instance_iou_kernel_wrapper(long total_gt_instances, long max_gt_instances,
+                                 const long* nInstance, int nProposal, const long* proposals_idx,
+                                 const long* proposals_offset, const long* instance_labels,
+                                 const long* offset_num_gt_instances, const long* batch,
+                                 const long* instance_pointnum, float* proposals_iou);
+
+/**
+ * Computes the IoU of every proposed instance against every ground-truth instance on the GPU.
+ *
+ * @param instance_idx      indices of the points belonging to each proposal, concatenated
+ * @param instance_offsets  start offset of each proposal inside instance_idx; its length minus
+ *                          one is the number of proposals
+ * @param gt_instances      per-point ground-truth instance label
+ * @param gt_instance_sizes number of points of each ground-truth instance
+ * @param num_gt_instances  number of ground-truth instances per sample
+ * @param batch             per-point sample index
+ * @return float tensor of shape (num proposals, sum(num_gt_instances)) on gt_instances' device
+ *
+ * All inputs must be contiguous CUDA tensors; their data is read as `long` by the kernel.
+ */
+at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets,
+                             at::Tensor gt_instances, at::Tensor gt_instance_sizes,
+                             at::Tensor num_gt_instances, at::Tensor batch)
+{
+    CHECK_CONTIGUOUS(instance_idx);
+    CHECK_CONTIGUOUS(instance_offsets);
+    CHECK_CONTIGUOUS(gt_instances);
+    CHECK_CONTIGUOUS(gt_instance_sizes);
+    CHECK_CONTIGUOUS(num_gt_instances);
+    CHECK_CONTIGUOUS(batch);
+
+    CHECK_CUDA(instance_idx);
+    CHECK_CUDA(instance_offsets);
+    CHECK_CUDA(gt_instances);
+    CHECK_CUDA(gt_instance_sizes);
+    // These two are dereferenced by the kernel as well, so they must live on the device too.
+    CHECK_CUDA(num_gt_instances);
+    CHECK_CUDA(batch);
+
+    TORCH_CHECK(instance_offsets.size(0) >= 1,
+                "instance_offsets must contain at least one element");
+
+    cudaSetDevice(instance_idx.get_device());
+    const long num_proposed_instances = instance_offsets.size(0) - 1;
+
+    // item<>() copies the scalar to the host and reports CUDA errors, replacing the previous
+    // malloc + unchecked cudaMemcpy pair whose buffers were never freed.
+    const int64_t total_gt_instances =
+        num_gt_instances.numel() > 0 ? num_gt_instances.sum().item<int64_t>() : 0;
+
+    at::Tensor output =
+        torch::zeros({num_proposed_instances, total_gt_instances},
+                     at::device(gt_instances.device()).dtype(at::ScalarType::Float));
+
+    // Nothing to compute: avoids max() on an empty tensor and a zero-sized kernel launch.
+    if (num_proposed_instances == 0 || total_gt_instances == 0)
+    {
+        return output;
+    }
+
+    const int64_t max_gt_instances = num_gt_instances.max().item<int64_t>();
+
+    // Exclusive prefix sum: column at which each sample's gt instances start in the output.
+    at::Tensor offset_num_gt_instances =
+        at::cat({at::zeros(1, num_gt_instances.options()), num_gt_instances.cumsum(0)}, 0);
+    instance_iou_kernel_wrapper(
+        total_gt_instances, max_gt_instances, num_gt_instances.DATA_PTR<long>(),
+        num_proposed_instances, instance_idx.DATA_PTR<long>(), instance_offsets.DATA_PTR<long>(),
+        gt_instances.DATA_PTR<long>(), offset_num_gt_instances.DATA_PTR<long>(),
+        batch.DATA_PTR<long>(), gt_instance_sizes.DATA_PTR<long>(), output.DATA_PTR<float>());
+
+    return output;
+}
@@ -0,0 +1,61 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_utils.h"
+
+#define THREADS 512
+
+// Fills proposals_iou[proposal, gt instance] with intersection / union point counts.
+//
+// Each block processes proposals blockIdx.x, blockIdx.x + gridDim.x, ...; inside a block each
+// thread processes ground-truth instances threadIdx.x, threadIdx.x + blockDim.x, ... of the
+// sample that the proposal's first point belongs to. Entries for instances of other samples are
+// not written.
+__global__ void instance_iou_cuda_kernel(
+    long total_gt_instances, const long* __restrict__ nInstance, int nProposal,
+    const long* __restrict__ proposals_idx, const long* __restrict__ proposals_offset,
+    const long* __restrict__ instance_labels, const long* __restrict__ offset_num_gt_instances,
+    const long* __restrict__ batch, const long* __restrict__ instance_pointnum,
+    float* proposals_iou)
+{
+    for (int proposal_id = blockIdx.x; proposal_id < nProposal; proposal_id += gridDim.x)
+    {
+        // Offsets and indices stay 64-bit so large point clouds are not truncated.
+        const long start = proposals_offset[proposal_id];
+        const long end = proposals_offset[proposal_id + 1];
+        if (end <= start)
+        {
+            // Empty proposal: proposals_idx[start] is not one of its points (and may lie past
+            // the end of the array), and its intersection with everything is 0. Its row of
+            // proposals_iou is left unwritten.
+            continue;
+        }
+        const long sampleIdx = batch[proposals_idx[start]];
+        const long sampleNInstances = nInstance[sampleIdx];
+        const long instanceOffset = offset_num_gt_instances[sampleIdx];
+        const long proposal_total = end - start;
+        for (long instance_id = threadIdx.x; instance_id < sampleNInstances;
+             instance_id += blockDim.x)
+        {
+            const long instance_total = instance_pointnum[instanceOffset + instance_id];
+            long intersection = 0;
+            for (long i = start; i < end; i++)
+            {
+                const long idx = proposals_idx[i];
+                if (instance_labels[idx] == instance_id + 1)
+                { // 0 is reserved for "no instance"
+                    intersection += 1;
+                }
+            }
+
+            // The 1e-5 keeps the denominator non-zero.
+            proposals_iou[instanceOffset + instance_id + proposal_id * total_gt_instances] =
+                (float)intersection /
+                ((float)(proposal_total + instance_total - intersection) + 1e-5);
+        }
+    }
+}
+
+// Launches instance_iou_cuda_kernel on the current CUDA stream.
+//
+// input: nInstance, long, number of gt instances of each sample (indexed by sample id)
+// input: proposals_idx (sumNPoint), long, point indices of all proposals, concatenated
+// input: proposals_offset (nProposal + 1), long, start of each proposal in proposals_idx
+// input: instance_labels, long, indexed by point; 1..nInstance[sample] within a sample,
+//        0 is reserved for "no instance"
+// input: offset_num_gt_instances, long, first output column of each sample (indexed by sample id)
+// input: batch, long, sample id of each point
+// input: instance_pointnum (total_gt_instances), long, point count of each gt instance
+// output: proposals_iou (nProposal, total_gt_instances), float, row-major
+//
+// max_gt_instances only sizes the thread block (capped at THREADS).
+void instance_iou_kernel_wrapper(long total_gt_instances, long max_gt_instances,
+                                 const long* nInstance, int nProposal, const long* proposals_idx,
+                                 const long* proposals_offset, const long* instance_labels,
+                                 const long* offset_num_gt_instances, const long* batch,
+                                 const long* instance_pointnum, float* proposals_iou)
+{
+    // A grid or block of size 0 is an invalid launch configuration; there is nothing to compute.
+    if (nProposal <= 0 || max_gt_instances <= 0)
+    {
+        return;
+    }
+
+    auto stream = at::cuda::getCurrentCUDAStream();
+    instance_iou_cuda_kernel<<<std::min(nProposal, THREADS * THREADS),
+                               std::min(max_gt_instances, (long)THREADS), 0, stream>>>(
+        total_gt_instances, nInstance, nProposal, proposals_idx, proposals_offset, instance_labels,
+        offset_num_gt_instances, batch, instance_pointnum, proposals_iou);
+}