From cf7b6a6f46aef6b2ab26378088ed3971cd5041ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Wdowski?= Date: Thu, 18 May 2023 21:39:30 +0200 Subject: [PATCH 1/2] Cwt WIP --- dali/kernels/signal/CMakeLists.txt | 1 + dali/kernels/signal/wavelets/CMakeLists.txt | 17 ++++ dali/kernels/signal/wavelets/cwt_args.h | 33 +++++++ dali/kernels/signal/wavelets/cwt_gpu.cu | 98 +++++++++++++++++++ dali/kernels/signal/wavelets/cwt_gpu.h | 50 ++++++++++ dali/operators/signal/CMakeLists.txt | 1 + dali/operators/signal/wavelets/CMakeLists.txt | 17 ++++ dali/operators/signal/wavelets/cwt_op.h | 65 ++++++++++++ dali/operators/signal/wavelets/cwt_op_gpu.cu | 80 +++++++++++++++ 9 files changed, 362 insertions(+) create mode 100644 dali/kernels/signal/wavelets/CMakeLists.txt create mode 100644 dali/kernels/signal/wavelets/cwt_args.h create mode 100644 dali/kernels/signal/wavelets/cwt_gpu.cu create mode 100644 dali/kernels/signal/wavelets/cwt_gpu.h create mode 100644 dali/operators/signal/wavelets/CMakeLists.txt create mode 100644 dali/operators/signal/wavelets/cwt_op.h create mode 100644 dali/operators/signal/wavelets/cwt_op_gpu.cu diff --git a/dali/kernels/signal/CMakeLists.txt b/dali/kernels/signal/CMakeLists.txt index 431ae396295..07b62d53429 100644 --- a/dali/kernels/signal/CMakeLists.txt +++ b/dali/kernels/signal/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(decibel) if (BUILD_FFTS) add_subdirectory(fft) endif() +add_subdirectory(wavelets) add_subdirectory(window) collect_headers(DALI_INST_HDRS PARENT_SCOPE) diff --git a/dali/kernels/signal/wavelets/CMakeLists.txt b/dali/kernels/signal/wavelets/CMakeLists.txt new file mode 100644 index 00000000000..c3ee135e612 --- /dev/null +++ b/dali/kernels/signal/wavelets/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +collect_headers(DALI_INST_HDRS PARENT_SCOPE) +collect_sources(DALI_KERNEL_SRCS PARENT_SCOPE) +collect_test_sources(DALI_KERNEL_TEST_SRCS PARENT_SCOPE) \ No newline at end of file diff --git a/dali/kernels/signal/wavelets/cwt_args.h b/dali/kernels/signal/wavelets/cwt_args.h new file mode 100644 index 00000000000..14f6cb7d5b2 --- /dev/null +++ b/dali/kernels/signal/wavelets/cwt_args.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef DALI_KERNELS_SIGNAL_WAVELETS_CWT_ARGS_H_ +#define DALI_KERNELS_SIGNAL_WAVELETS_CWT_ARGS_H_ + +namespace dali { +namespace kernels { +namespace signal { +namespace wavelets { + +template +struct CwtArgs { + T a; +}; + +} // namespace wavelets +} // namespace signal +} // namespace kernels +} // namespace dali + +#endif // DALI_KERNELS_SIGNAL_WAVELETS_CWT_ARGS_H_ diff --git a/dali/kernels/signal/wavelets/cwt_gpu.cu b/dali/kernels/signal/wavelets/cwt_gpu.cu new file mode 100644 index 00000000000..be2b19bde28 --- /dev/null +++ b/dali/kernels/signal/wavelets/cwt_gpu.cu @@ -0,0 +1,98 @@ +// Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "dali/core/common.h" +#include "dali/core/error_handling.h" +#include "dali/core/format.h" +#include "dali/kernels/kernel.h" +#include "dali/kernels/signal/wavelets/cwt_args.h" +#include "dali/kernels/signal/wavelets/cwt_gpu.h" + +namespace dali { +namespace kernels { +namespace signal { +namespace wavelets { + +template +struct SampleDesc { + const T *in = nullptr; + T *out = nullptr; + int64_t size = 0; +}; + +template +__global__ void CwtKernel(const SampleDesc *sample_data, CwtArgs args) { + const int64_t block_size = blockDim.y * blockDim.x; + const int64_t grid_size = gridDim.x * block_size; + const int sample_idx = blockIdx.y; + const auto sample = sample_data[sample_idx]; + const int64_t offset = block_size * blockIdx.x; + const int64_t tid = threadIdx.y * blockDim.x + threadIdx.x; + + for (int64_t idx = offset + tid; idx < sample.size; idx += grid_size) { + sample.out[idx] = sample.in[idx] * args.a; + } +} + +template +CwtGpu::~CwtGpu() = default; + +template +KernelRequirements CwtGpu::Setup(KernelContext &context, + const InListGPU &in) { + auto out_shape = in.shape; + const size_t num_samples = in.size(); + ScratchpadEstimator se; + se.add>(num_samples); + se.add>(num_samples); + KernelRequirements req; + req.scratch_sizes = se.sizes; + req.output_shapes = {out_shape}; + return req; +} + +template +void CwtGpu::Run(KernelContext &context, const OutListGPU &out, + const InListGPU &in, const CwtArgs &args) { + auto num_samples = in.size(); + auto *sample_data = context.scratchpad->AllocateHost>(num_samples); + + for (int i = 0; i < num_samples; i++) { + auto &sample = sample_data[i]; + sample.out = out.tensor_data(i); + sample.in = in.tensor_data(i); + sample.size = volume(in.tensor_shape(i)); + assert(sample.size == volume(out.tensor_shape(i))); + } + + auto *sample_data_gpu = context.scratchpad->AllocateGPU>(num_samples); + CUDA_CALL(cudaMemcpyAsync(sample_data_gpu, sample_data, num_samples * sizeof(SampleDesc), + cudaMemcpyHostToDevice, context.gpu.stream)); + + dim3 block(32, 32); + auto blocks_per_sample = std::max(32, 1024 / num_samples); + dim3 grid(blocks_per_sample, num_samples); + CwtKernel<<>>(sample_data_gpu, args); +} + +template class CwtGpu; +template class CwtGpu; + +} // namespace wavelets +} // namespace signal +} // namespace kernels +} // namespace dali diff --git a/dali/kernels/signal/wavelets/cwt_gpu.h b/dali/kernels/signal/wavelets/cwt_gpu.h new file mode 100644 index 00000000000..2fa4fd939eb --- /dev/null +++ b/dali/kernels/signal/wavelets/cwt_gpu.h @@ -0,0 +1,50 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef DALI_KERNELS_SIGNAL_WAVELETS_CWT_GPU_H_ +#define DALI_KERNELS_SIGNAL_WAVELETS_CWT_GPU_H_ + +#include +#include "dali/core/common.h" +#include "dali/core/error_handling.h" +#include "dali/core/format.h" +#include "dali/core/util.h" +#include "dali/kernels/kernel.h" +#include "dali/kernels/signal/wavelets/cwt_args.h" + +namespace dali { +namespace kernels { +namespace signal { +namespace wavelets { + +template +class DLL_PUBLIC CwtGpu { + public: + static_assert(std::is_floating_point::value, "Only floating point types are supported"); + + DLL_PUBLIC ~CwtGpu(); + + DLL_PUBLIC KernelRequirements Setup(KernelContext &context, + const InListGPU &in); + + DLL_PUBLIC void Run(KernelContext &context, const OutListGPU &out, + const InListGPU &in, const CwtArgs &args); +}; + +} // namespace wavelets +} // namespace signal +} // namespace kernels +} // namespace dali + +#endif // DALI_KERNELS_SIGNAL_WAVELETS_CWT_GPU_H_ diff --git a/dali/operators/signal/CMakeLists.txt b/dali/operators/signal/CMakeLists.txt index 217f785aa24..c16a5d4687b 100644 --- a/dali/operators/signal/CMakeLists.txt +++ b/dali/operators/signal/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(decibel) if (BUILD_FFTS) add_subdirectory(fft) endif() +add_subdirectory(wavelets) collect_headers(DALI_INST_HDRS PARENT_SCOPE) collect_sources(DALI_OPERATOR_SRCS PARENT_SCOPE) diff --git a/dali/operators/signal/wavelets/CMakeLists.txt b/dali/operators/signal/wavelets/CMakeLists.txt new file mode 100644 index 00000000000..0dba230abf0 --- /dev/null +++ b/dali/operators/signal/wavelets/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +collect_headers(DALI_INST_HDRS PARENT_SCOPE) +collect_sources(DALI_OPERATOR_SRCS PARENT_SCOPE) +collect_test_sources(DALI_OPERATOR_TEST_SRCS PARENT_SCOPE) diff --git a/dali/operators/signal/wavelets/cwt_op.h b/dali/operators/signal/wavelets/cwt_op.h new file mode 100644 index 00000000000..3d6e439d493 --- /dev/null +++ b/dali/operators/signal/wavelets/cwt_op.h @@ -0,0 +1,65 @@ +// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef DALI_OPERATORS_SIGNAL_WAVELETS_CWT_H_ +#define DALI_OPERATORS_SIGNAL_WAVELETS_CWT_H_ + +#include +#include +#include "dali/core/common.h" +#include "dali/kernels/signal/wavelets/cwt_args.h" +#include "dali/pipeline/operator/common.h" +#include "dali/pipeline/operator/operator.h" +#include "dali/pipeline/util/operator_impl_utils.h" + +namespace dali { + +template +class Cwt : public Operator { + public: + explicit Cwt(const OpSpec &spec) : Operator(spec) { + if (!spec.HasArgument("a")) { + DALI_ENFORCE("`a` argument must be provided."); + } + args_.a = spec.GetArgument("a"); + } + + protected: + bool CanInferOutputs() const override { + return true; + } + + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { + assert(impl_ != nullptr); + return impl_->SetupImpl(output_desc, ws); + } + + void RunImpl(Workspace &ws) override { + assert(impl_ != nullptr); + impl_->RunImpl(ws); + } + + USE_OPERATOR_MEMBERS(); + using Operator::RunImpl; + + kernels::KernelManager kmgr_; + kernels::signal::wavelets::CwtArgs args_; + + std::unique_ptr> impl_; + DALIDataType type_ = DALI_NO_TYPE; +}; + +} // namespace dali + +#endif // DALI_OPERATORS_SIGNAL_WAVELETS_CWT_H_ diff --git a/dali/operators/signal/wavelets/cwt_op_gpu.cu b/dali/operators/signal/wavelets/cwt_op_gpu.cu new file mode 100644 index 00000000000..3cea5427d61 --- /dev/null +++ b/dali/operators/signal/wavelets/cwt_op_gpu.cu @@ -0,0 +1,80 @@ +// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "dali/core/static_switch.h" +#include "dali/kernels/kernel_manager.h" +#include "dali/kernels/kernel_params.h" +#include "dali/kernels/signal/wavelets/cwt_args.h" +#include "dali/kernels/signal/wavelets/cwt_gpu.h" +#include "dali/operators/signal/wavelets/cwt_op.h" +#include "dali/pipeline/data/views.h" + +namespace dali { + +DALI_SCHEMA(Cwt).DocStr("by MW").NumInput(1).NumOutput(1).AddArg("a", "costam", + type2id::value); + +template +struct CwtImplGPU : public OpImplBase { + public: + using CwtArgs = kernels::signal::wavelets::CwtArgs; + using CwtKernel = kernels::signal::wavelets::CwtGpu; + + explicit CwtImplGPU(CwtArgs args) : args_(std::move(args)) { + kmgr_cwt_.Resize(1); + } + + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { + const auto &input = ws.Input(0); + auto in_view = view(input); + + auto type = type2id::value; + + kernels::KernelContext ctx; + ctx.gpu.stream = ws.stream(); + + auto &req = kmgr_cwt_.Setup(0, ctx, in_view); + output_desc.resize(1); + output_desc[0].type = type; + output_desc[0].shape = req.output_shapes[0]; + + return true; + } + + void RunImpl(Workspace &ws) override { + const auto &input = ws.Input(0); + auto &output = ws.Output(0); + + auto in_view = view(input); + auto out_view = view(output); + + kernels::KernelContext ctx; + ctx.gpu.stream = ws.stream(); + + kmgr_cwt_.Run(0, ctx, out_view, in_view, args_); + } + + private: + CwtArgs args_; + kernels::KernelManager kmgr_cwt_; + std::vector cwt_out_desc_; + TensorList cwt_out_; +}; + +DALI_REGISTER_OPERATOR(Cwt, Cwt, GPU); + +} // namespace dali From b0346197840c83a654143f6e34d55d1cef1a8f36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Wdowski?= Date: Thu, 18 May 2023 22:20:41 +0200 Subject: [PATCH 2/2] Rename namespace --- dali/kernels/signal/CMakeLists.txt | 2 +- .../signal/{wavelets => wavelet}/CMakeLists.txt | 0 dali/kernels/signal/{wavelets => wavelet}/cwt_args.h | 10 +++++----- dali/kernels/signal/{wavelets => wavelet}/cwt_gpu.cu | 4 ++-- dali/kernels/signal/{wavelets => wavelet}/cwt_gpu.h | 10 +++++----- dali/operators/signal/CMakeLists.txt | 2 +- .../signal/{wavelets => wavelet}/CMakeLists.txt | 0 dali/operators/signal/{wavelets => wavelet}/cwt_op.h | 0 .../signal/{wavelets => wavelet}/cwt_op_gpu.cu | 0 9 files changed, 14 insertions(+), 14 deletions(-) rename dali/kernels/signal/{wavelets => wavelet}/CMakeLists.txt (100%) rename dali/kernels/signal/{wavelets => wavelet}/cwt_args.h (80%) rename dali/kernels/signal/{wavelets => wavelet}/cwt_gpu.cu (98%) rename dali/kernels/signal/{wavelets => wavelet}/cwt_gpu.h (88%) rename dali/operators/signal/{wavelets => wavelet}/CMakeLists.txt (100%) rename dali/operators/signal/{wavelets => wavelet}/cwt_op.h (100%) rename dali/operators/signal/{wavelets => wavelet}/cwt_op_gpu.cu (100%) diff --git a/dali/kernels/signal/CMakeLists.txt b/dali/kernels/signal/CMakeLists.txt index 07b62d53429..74ca2e89701 100644 --- a/dali/kernels/signal/CMakeLists.txt +++ b/dali/kernels/signal/CMakeLists.txt @@ -17,7 +17,7 @@ add_subdirectory(decibel) if (BUILD_FFTS) add_subdirectory(fft) endif() -add_subdirectory(wavelets) +add_subdirectory(wavelet) add_subdirectory(window) collect_headers(DALI_INST_HDRS PARENT_SCOPE) diff --git a/dali/kernels/signal/wavelets/CMakeLists.txt b/dali/kernels/signal/wavelet/CMakeLists.txt similarity index 100% rename from dali/kernels/signal/wavelets/CMakeLists.txt rename to dali/kernels/signal/wavelet/CMakeLists.txt diff --git a/dali/kernels/signal/wavelets/cwt_args.h b/dali/kernels/signal/wavelet/cwt_args.h similarity index 80% rename from dali/kernels/signal/wavelets/cwt_args.h rename to dali/kernels/signal/wavelet/cwt_args.h index 14f6cb7d5b2..b61d064a9e7 100644 --- a/dali/kernels/signal/wavelets/cwt_args.h +++ b/dali/kernels/signal/wavelet/cwt_args.h @@ -12,22 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef DALI_KERNELS_SIGNAL_WAVELETS_CWT_ARGS_H_ -#define DALI_KERNELS_SIGNAL_WAVELETS_CWT_ARGS_H_ +#ifndef DALI_KERNELS_SIGNAL_WAVELET_CWT_ARGS_H_ +#define DALI_KERNELS_SIGNAL_WAVELET_CWT_ARGS_H_ namespace dali { namespace kernels { namespace signal { -namespace wavelets { +namespace wavelet { template struct CwtArgs { T a; }; -} // namespace wavelets +} // namespace wavelet } // namespace signal } // namespace kernels } // namespace dali -#endif // DALI_KERNELS_SIGNAL_WAVELETS_CWT_ARGS_H_ +#endif // DALI_KERNELS_SIGNAL_WAVELET_CWT_ARGS_H_ diff --git a/dali/kernels/signal/wavelets/cwt_gpu.cu b/dali/kernels/signal/wavelet/cwt_gpu.cu similarity index 98% rename from dali/kernels/signal/wavelets/cwt_gpu.cu rename to dali/kernels/signal/wavelet/cwt_gpu.cu index be2b19bde28..a15f82929a4 100644 --- a/dali/kernels/signal/wavelets/cwt_gpu.cu +++ b/dali/kernels/signal/wavelet/cwt_gpu.cu @@ -25,7 +25,7 @@ namespace dali { namespace kernels { namespace signal { -namespace wavelets { +namespace wavelet { template struct SampleDesc { @@ -92,7 +92,7 @@ void CwtGpu::Run(KernelContext &context, const OutListGPU; template class CwtGpu; -} // namespace wavelets +} // namespace wavelet } // namespace signal } // namespace kernels } // namespace dali diff --git a/dali/kernels/signal/wavelets/cwt_gpu.h b/dali/kernels/signal/wavelet/cwt_gpu.h similarity index 88% rename from dali/kernels/signal/wavelets/cwt_gpu.h rename to dali/kernels/signal/wavelet/cwt_gpu.h index 2fa4fd939eb..62f9cef738c 100644 --- a/dali/kernels/signal/wavelets/cwt_gpu.h +++ b/dali/kernels/signal/wavelet/cwt_gpu.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef DALI_KERNELS_SIGNAL_WAVELETS_CWT_GPU_H_ -#define DALI_KERNELS_SIGNAL_WAVELETS_CWT_GPU_H_ +#ifndef DALI_KERNELS_SIGNAL_WAVELET_CWT_GPU_H_ +#define DALI_KERNELS_SIGNAL_WAVELET_CWT_GPU_H_ #include #include "dali/core/common.h" @@ -26,7 +26,7 @@ namespace dali { namespace kernels { namespace signal { -namespace wavelets { +namespace wavelet { template class DLL_PUBLIC CwtGpu { @@ -42,9 +42,9 @@ class DLL_PUBLIC CwtGpu { const InListGPU &in, const CwtArgs &args); }; -} // namespace wavelets +} // namespace wavelet } // namespace signal } // namespace kernels } // namespace dali -#endif // DALI_KERNELS_SIGNAL_WAVELETS_CWT_GPU_H_ +#endif // DALI_KERNELS_SIGNAL_WAVELET_CWT_GPU_H_ diff --git a/dali/operators/signal/CMakeLists.txt b/dali/operators/signal/CMakeLists.txt index c16a5d4687b..44d93c05bae 100644 --- a/dali/operators/signal/CMakeLists.txt +++ b/dali/operators/signal/CMakeLists.txt @@ -16,7 +16,7 @@ add_subdirectory(decibel) if (BUILD_FFTS) add_subdirectory(fft) endif() -add_subdirectory(wavelets) +add_subdirectory(wavelet) collect_headers(DALI_INST_HDRS PARENT_SCOPE) collect_sources(DALI_OPERATOR_SRCS PARENT_SCOPE) diff --git a/dali/operators/signal/wavelets/CMakeLists.txt b/dali/operators/signal/wavelet/CMakeLists.txt similarity index 100% rename from dali/operators/signal/wavelets/CMakeLists.txt rename to dali/operators/signal/wavelet/CMakeLists.txt diff --git a/dali/operators/signal/wavelets/cwt_op.h b/dali/operators/signal/wavelet/cwt_op.h similarity index 100% rename from dali/operators/signal/wavelets/cwt_op.h rename to dali/operators/signal/wavelet/cwt_op.h diff --git a/dali/operators/signal/wavelets/cwt_op_gpu.cu b/dali/operators/signal/wavelet/cwt_op_gpu.cu similarity index 100% rename from dali/operators/signal/wavelets/cwt_op_gpu.cu rename to dali/operators/signal/wavelet/cwt_op_gpu.cu