diff --git a/docs/en/understand_mmcv/ops.md b/docs/en/understand_mmcv/ops.md index 262711d21c..c49855cfe1 100644 --- a/docs/en/understand_mmcv/ops.md +++ b/docs/en/understand_mmcv/ops.md @@ -6,7 +6,7 @@ We implement common ops used in detection, segmentation, etc. | ---------------------------- | --- | ---- | --- | --- | ------ | | ActiveRotatedFilter | √ | √ | | | | | AssignScoreWithK | | √ | | | | -| BallQuery | | √ | | | | +| BallQuery | | √ | √ | | | | BBoxOverlaps | | √ | √ | √ | | | BorderAlign | | √ | | | | | BoxIouRotated | √ | √ | | | | diff --git a/docs/zh_cn/understand_mmcv/ops.md b/docs/zh_cn/understand_mmcv/ops.md index a15392e186..2918600212 100644 --- a/docs/zh_cn/understand_mmcv/ops.md +++ b/docs/zh_cn/understand_mmcv/ops.md @@ -6,7 +6,7 @@ MMCV 提供了检测、分割等任务中常用的算子 | ---------------------------- | --- | ---- | --- | --- | ------ | | ActiveRotatedFilter | √ | √ | | | | | AssignScoreWithK | | √ | | | | -| BallQuery | | √ | | | | +| BallQuery | | √ | √ | | | | BBoxOverlaps | | √ | √ | √ | | | BorderAlign | | √ | | | | | BoxIouRotated | √ | √ | | | | diff --git a/mmcv/ops/csrc/pytorch/mlu/ball_query_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/ball_query_mlu.cpp new file mode 100644 index 0000000000..efaae5e8e4 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/mlu/ball_query_mlu.cpp @@ -0,0 +1,47 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "mlu_common_helper.h" + +void ball_query_forward_mlu(int b, int n, int m, float min_radius, + float max_radius, int nsample, const Tensor new_xyz, + const Tensor xyz, Tensor idx) { + MluOpTensorDescriptor new_xyz_desc, xyz_desc, idx_desc; + new_xyz_desc.set(new_xyz); + xyz_desc.set(xyz); + idx_desc.set(idx); + + auto new_xyz_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous( + new_xyz, new_xyz.suggest_memory_format()); + auto xyz_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous( + xyz, new_xyz.suggest_memory_format()); + auto idx_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous( + idx, new_xyz.suggest_memory_format()); + + auto new_xyz_impl = torch_mlu::getMluTensorImpl(new_xyz_contiguous); + auto xyz_impl = torch_mlu::getMluTensorImpl(xyz_contiguous); + auto idx_impl = torch_mlu::getMluTensorImpl(idx_contiguous); + auto new_xyz_ptr = new_xyz_impl->cnnlMalloc(); + auto xyz_ptr = xyz_impl->cnnlMalloc(); + auto idx_ptr = idx_impl->cnnlMalloc(); + + auto handle = mluOpGetCurrentHandle(); + mluOpBallQuery(handle, new_xyz_desc.desc(), new_xyz_ptr, xyz_desc.desc(), + xyz_ptr, min_radius, max_radius, nsample, idx_desc.desc(), + idx_ptr); +} + +void ball_query_forward_impl(int b, int n, int m, float min_radius, + float max_radius, int nsample, + const Tensor new_xyz, const Tensor xyz, + Tensor idx); + +REGISTER_DEVICE_IMPL(ball_query_forward_impl, MLU, ball_query_forward_mlu); diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp new file mode 100644 index 0000000000..442c55dbc9 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp @@ -0,0 +1,103 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "mlu_common_helper.h"
+
+// Descriptors
+mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type) {
+  const std::map<std::string, mluOpDataType_t> mapping_type = {
+      {std::string("c10::Half"), MLUOP_DTYPE_HALF},
+      {std::string("float"), MLUOP_DTYPE_FLOAT},
+      {std::string("double"), MLUOP_DTYPE_DOUBLE},
+      {std::string("int8"), MLUOP_DTYPE_INT8},
+      {std::string("signed char"), MLUOP_DTYPE_INT8},
+      {std::string("short int"), MLUOP_DTYPE_INT16},
+      {std::string("short"), MLUOP_DTYPE_INT16},
+      {std::string("int"), MLUOP_DTYPE_INT32},
+      {std::string("long int"), MLUOP_DTYPE_INT64},
+      {std::string("long"), MLUOP_DTYPE_INT64},
+      {std::string("unsigned char"), MLUOP_DTYPE_UINT8},
+      {std::string("bool"), MLUOP_DTYPE_BOOL},
+      {std::string("c10::complex<c10::Half>"), MLUOP_DTYPE_COMPLEX_HALF},
+      {std::string("c10::complex<float>"), MLUOP_DTYPE_COMPLEX_FLOAT}};
+
+  if (mapping_type.find(std::string(data_type.name())) != mapping_type.end()) {
+    return mapping_type.find(std::string(data_type.name()))->second;
+  }
+  return MLUOP_DTYPE_INVALID;
+}
+
+// layout
+mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input) {
+  auto suggest_memory_format = input.suggest_memory_format();
+  mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY;
+  switch (input.dim()) {
+    case 4:
+      layout = (suggest_memory_format == at::MemoryFormat::ChannelsLast)
+                   ? 
MLUOP_LAYOUT_NHWC
+                   : MLUOP_LAYOUT_NCHW;
+      break;
+    case 5:
+      layout = (suggest_memory_format == at::MemoryFormat::ChannelsLast3d)
+                   ? MLUOP_LAYOUT_NDHWC
+                   : MLUOP_LAYOUT_NCDHW;
+      break;
+    default:
+      layout = MLUOP_LAYOUT_ARRAY;
+  }
+  return layout;
+}
+
+void MluOpTensorDescriptor::set(Tensor t) {
+  mluOpDataType_t data_type = getMluOpDataType(t.dtype());
+  mluOpTensorLayout_t layout = getMluOpSuggestLayout(t);
+  int t_dim = t.dim();
+  std::vector<int> dim_array;
+  if (t_dim == 0) {
+    dim_array.push_back(
+        1);  // ScalarTensor(0-dim 1-item Tensor) view like size = 1 as default;
+  } else {
+    for (int i = 0; i < t_dim; i++) {
+      dim_array.push_back(static_cast<int>(t.sizes().vec()[i]));
+    }
+  }
+  set_desc(t, layout, data_type, dim_array);
+}
+
+void MluOpTensorDescriptor::set_desc(const at::Tensor& t,
+                                     mluOpTensorLayout_t layout,
+                                     mluOpDataType_t dtype,
+                                     std::vector<int>& dims) {
+  int dimNb = dims.size();
+  mluOpSetTensorDescriptor(desc_, layout, dtype, dimNb, dims.data());
+}
+
+// Handles
+std::once_flag mmcv_mluop_init_flag;
+std::mutex mmcv_mluop_mutex;
+static std::vector<MluOpHandle> mmcv_mluop_handles;
+
+mluOpHandle_t mluOpGetCurrentHandle(c10::DeviceIndex device_index) {
+  std::call_once(mmcv_mluop_init_flag,
+                 []()  // Init mmcv_mluop_handles 1-device <-> 1-handle
+                 {
+                   c10::DeviceIndex num_devices = torch_mlu::device_count();
+                   mmcv_mluop_handles.resize(num_devices);
+                 });
+
+  if (device_index == -1) {
+    device_index = torch_mlu::current_device();
+  }
+  std::lock_guard<std::mutex> mmcv_mluop_guard(mmcv_mluop_mutex);
+  auto queue = torch_mlu::getCurrentQueue(device_index).queue();
+  mmcv_mluop_handles[device_index].setQueue(queue);
+  return mmcv_mluop_handles[device_index].handle;
+}
diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
new file mode 100644
index 0000000000..321aa0df95
--- /dev/null
+++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
@@ -0,0 +1,54 @@
+/*************************************************************************
+ * Copyright (C) 2022 Cambricon.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#pragma once
+#include <ATen/ATen.h>
+#include <c10/core/ScalarType.h>
+
+#include "aten.h"
+#include "mlu_op.h"
+#include "pytorch_device_registry.hpp"
+
+#define MLUOP_MAJOR 0
+#define MLUOP_MINOR 4
+#define MLUOP_PATCHLEVEL 1
+
+mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
+mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
+
+class MluOpTensorDescriptor {
+ public:
+  MluOpTensorDescriptor() { mluOpCreateTensorDescriptor(&desc_); };
+  ~MluOpTensorDescriptor() { mluOpDestroyTensorDescriptor(desc_); }
+
+  void set(at::Tensor);
+  mluOpTensorDescriptor_t desc() { return desc_; }
+
+ private:
+  mluOpTensorDescriptor_t desc_;
+  void set_desc(const at::Tensor&, mluOpTensorLayout_t, mluOpDataType_t,
+                std::vector<int>& dims);
+};
+
+mluOpHandle_t mluOpGetCurrentHandle(c10::DeviceIndex device_index = -1);
+
+class MluOpHandle {
+ public:
+  MluOpHandle() : handle(nullptr) { mluOpCreate(&handle); }
+  ~MluOpHandle() {
+    if (handle) {
+      mluOpDestroy(handle);
+      handle = nullptr;
+    }
+  }
+  void setQueue(cnrtQueue_t queue) { mluOpSetQueue(handle, queue); }
+  mluOpHandle_t handle;
+};
diff --git a/setup.py b/setup.py
index 5b357aa3dc..5a7afe86f7 100644
--- a/setup.py
+++ b/setup.py
@@ -270,6 +270,7 @@ def get_extensions():
 
     include_dirs = []
+    extra_objects = []
    is_rocm_pytorch = False
    try:
        from torch.utils.cpp_extension import ROCM_HOME
@@ -300,16 +301,98 @@ def get_extensions():
torch.is_mlu_available()) or \ os.getenv('FORCE_MLU', '0') == '1': from torch_mlu.utils.cpp_extension import MLUExtension + + def get_mluops_version(file_path): + with open(file_path) as f: + for line in f: + if re.search('MLUOP_MAJOR', line): + major = line.strip().split(' ')[2] + if re.search('MLUOP_MINOR', line): + minor = line.strip().split(' ')[2] + if re.search('MLUOP_PATCHLEVEL', line): + patchlevel = line.strip().split(' ')[2] + mluops_version = f'v{major}.{minor}.{patchlevel}' + return mluops_version + + mmcv_mluops_version = get_mluops_version( + './mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h') + mlu_ops_path = os.getenv('MMCV_MLU_OPS_PATH') + if mlu_ops_path: + exists_mluops_version = get_mluops_version( + mlu_ops_path + '/bangc-ops/mlu_op.h') + if exists_mluops_version != mmcv_mluops_version: + print('the version of mlu-ops provided is %s,' + ' while %s is needed.' % + (exists_mluops_version, mmcv_mluops_version)) + exit() + try: + if os.path.exists('mlu-ops'): + if os.path.islink('mlu-ops'): + os.remove('mlu-ops') + os.symlink(mlu_ops_path, 'mlu-ops') + elif os.path.abspath('mlu-ops') != mlu_ops_path: + os.symlink(mlu_ops_path, 'mlu-ops') + else: + os.symlink(mlu_ops_path, 'mlu-ops') + except Exception: + raise FileExistsError( + 'mlu-ops already exists, please move it out,' + 'or rename or remove it.') + else: + if not os.path.exists('mlu-ops'): + import requests + mluops_url = 'https://github.com/Cambricon/mlu-ops/' + \ + 'archive/refs/tags/' + mmcv_mluops_version + '.zip' + req = requests.get(mluops_url) + with open('./mlu-ops.zip', 'wb') as f: + try: + f.write(req.content) + except Exception: + raise ImportError('failed to download mlu-ops') + + from zipfile import BadZipFile, ZipFile + with ZipFile('./mlu-ops.zip', 'r') as archive: + try: + archive.extractall() + dir_name = archive.namelist()[0].split('/')[0] + os.rename(dir_name, 'mlu-ops') + except BadZipFile: + print('invalid mlu-ops.zip file') + else: + exists_mluops_version = 
get_mluops_version( + './mlu-ops/bangc-ops/mlu_op.h') + if exists_mluops_version != mmcv_mluops_version: + print('the version of provided mlu-ops is %s,' + ' while %s is needed.' % + (exists_mluops_version, mmcv_mluops_version)) + exit() + define_macros += [('MMCV_WITH_MLU', None)] mlu_args = os.getenv('MMCV_MLU_ARGS') - extra_compile_args['cncc'] = [mlu_args] if mlu_args else [] + mluops_includes = [] + mluops_includes.append('-I' + + os.path.abspath('./mlu-ops/bangc-ops')) + mluops_includes.append( + '-I' + os.path.abspath('./mlu-ops/bangc-ops/kernels')) + extra_compile_args['cncc'] = [mlu_args] + \ + mluops_includes if mlu_args else mluops_includes + extra_compile_args['cxx'] += ['-fno-gnu-unique'] op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \ glob.glob('./mmcv/ops/csrc/pytorch/mlu/*.cpp') + \ - glob.glob('./mmcv/ops/csrc/common/mlu/*.mlu') + glob.glob('./mmcv/ops/csrc/common/mlu/*.mlu') + \ + glob.glob( + './mlu-ops/bangc-ops/core/**/*.cpp', recursive=True) + \ + glob.glob( + './mlu-ops/bangc-ops/kernels/**/*.cpp', recursive=True) + \ + glob.glob( + './mlu-ops/bangc-ops/kernels/**/*.mlu', recursive=True) + extra_objects = glob.glob( + './mlu-ops/bangc-ops/kernels/*/x86_64/*.o') extension = MLUExtension include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mlu')) + include_dirs.append(os.path.abspath('./mlu-ops/bangc-ops')) elif (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()) or os.getenv( 'FORCE_MPS', '0') == '1': @@ -371,6 +454,7 @@ def get_extensions(): sources=op_files, include_dirs=include_dirs, define_macros=define_macros, + extra_objects=extra_objects, extra_compile_args=extra_compile_args) extensions.append(ext_ops) diff --git a/tests/test_ops/test_ball_query.py b/tests/test_ops/test_ball_query.py index d3fc7912c5..a3f6518197 100644 --- a/tests/test_ops/test_ball_query.py +++ 
b/tests/test_ops/test_ball_query.py @@ -3,55 +3,59 @@ import torch from mmcv.ops import ball_query +from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE -@pytest.mark.skipif( - not torch.cuda.is_available(), reason='requires CUDA support') -def test_ball_query(): - new_xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625], - [-2.2769, 2.7817, -0.2334], - [-0.4003, 2.4666, -0.5116], - [-0.0740, 1.3147, -1.3625], - [-0.0740, 1.3147, -1.3625]], - [[-2.0289, 2.4952, -0.1708], - [-2.0668, 6.0278, -0.4875], - [0.4066, 1.4211, -0.2947], - [-2.0289, 2.4952, -0.1708], - [-2.0289, 2.4952, -0.1708]]]).cuda() +@pytest.mark.parametrize('device', [ + pytest.param( + 'cuda', + marks=pytest.mark.skipif( + not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'mlu', + marks=pytest.mark.skipif( + not IS_MLU_AVAILABLE, reason='requires MLU support')) +]) +def test_ball_query(device): + new_xyz = torch.tensor( + [[[-0.0740, 1.3147, -1.3625], [-2.2769, 2.7817, -0.2334], + [-0.4003, 2.4666, -0.5116], [-0.0740, 1.3147, -1.3625], + [-0.0740, 1.3147, -1.3625]], + [[-2.0289, 2.4952, -0.1708], [-2.0668, 6.0278, -0.4875], + [0.4066, 1.4211, -0.2947], [-2.0289, 2.4952, -0.1708], + [-2.0289, 2.4952, -0.1708]]], + device=device) - xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634], - [-0.4003, 2.4666, - -0.5116], [-0.5251, 2.4379, -0.8466], - [-0.9691, 1.1418, - -1.3733], [-0.2232, 0.9561, -1.3626], - [-2.2769, 2.7817, -0.2334], - [-0.2822, 1.3192, -1.3645], [0.1533, 1.5024, -1.0432], - [0.4917, 1.1529, -1.3496]], - [[-2.0289, 2.4952, - -0.1708], [-0.7188, 0.9956, -0.5096], - [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610], - [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791], - [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947], - [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, - -1.2000]]]).cuda() + xyz = torch.tensor( + [[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634], + [-0.4003, 2.4666, -0.5116], [-0.5251, 2.4379, -0.8466], + 
[-0.9691, 1.1418, -1.3733], [-0.2232, 0.9561, -1.3626], + [-2.2769, 2.7817, -0.2334], [-0.2822, 1.3192, -1.3645], + [0.1533, 1.5024, -1.0432], [0.4917, 1.1529, -1.3496]], + [[-2.0289, 2.4952, -0.1708], [-0.7188, 0.9956, -0.5096], + [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610], + [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791], + [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947], + [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, -1.2000]]], + device=device) idx = ball_query(0, 0.2, 5, xyz, new_xyz) - expected_idx = torch.tensor([[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], - [2, 2, 2, 2, 2], [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]], - [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], - [7, 7, 7, 7, 7], [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]]).cuda() + expected_idx = torch.tensor( + [[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], [2, 2, 2, 2, 2], [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]], + [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]], + device=device) assert torch.all(idx == expected_idx) # test dilated ball query idx = ball_query(0.2, 0.4, 5, xyz, new_xyz) - expected_idx = torch.tensor([[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6], - [2, 3, 2, 2, 2], [0, 5, 7, 0, 0], - [0, 5, 7, 0, 0]], - [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], - [7, 7, 7, 7, 7], [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]]).cuda() + expected_idx = torch.tensor( + [[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6], [2, 3, 2, 2, 2], [0, 5, 7, 0, 0], + [0, 5, 7, 0, 0]], + [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]], + device=device) assert torch.all(idx == expected_idx)